From cfc65dd57967f2e0c7b3a8b73e6d12470b1cf1c1 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Thu, 30 Jul 2009 16:15:18 -0600 Subject: iommu=pt is a valid early param This avoids a "Malformed early option 'iommu'" warning on boot when trying to use pass-through mode. Signed-off-by: Alex Williamson Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1a041bcf506b..ae13e34f7248 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -212,10 +212,8 @@ static __init int iommu_setup(char *p) if (!strncmp(p, "soft", 4)) swiotlb = 1; #endif - if (!strncmp(p, "pt", 2)) { + if (!strncmp(p, "pt", 2)) iommu_pass_through = 1; - return 1; - } gart_parse_options(p); -- cgit v1.2.3 From 19943b0e30b05d42e494ae6fef78156ebc8c637e Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 4 Aug 2009 16:19:20 +0100 Subject: intel-iommu: Unify hardware and software passthrough support This makes the hardware passthrough mode work a lot more like the software version, so that the behaviour of a kernel with 'iommu=pt' is the same whether the hardware supports passthrough or not. In particular: - We use a single si_domain for the pass-through devices. - 32-bit devices can be taken out of the pass-through domain so that they don't have to use swiotlb. - Devices will work again after being removed from a KVM guest. - A potential oops on OOM (in init_context_pass_through()) is fixed. Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-swiotlb.c | 5 +- drivers/pci/intel-iommu.c | 174 ++++++++++++++++++------------------------ 2 files changed, 76 insertions(+), 103 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6af96ee44200..1e66b18f45cb 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -71,9 +71,8 @@ void __init pci_swiotlb_init(void) { /* don't initialize swiotlb if iommu=off (no_iommu=1) */ #ifdef CONFIG_X86_64 - if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN) || - iommu_pass_through) - swiotlb = 1; + if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) + swiotlb = 1; #endif if (swiotlb_force) swiotlb = 1; diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 097d5da2fae1..147b3b960b61 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -251,7 +251,8 @@ static inline int first_pte_in_page(struct dma_pte *pte) * 2. It maps to each iommu if successful. * 3. Each iommu mapps to this domain if successful. */ -struct dmar_domain *si_domain; +static struct dmar_domain *si_domain; +static int hw_pass_through = 1; /* devices under the same p2p bridge are owned in one domain */ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) @@ -1948,14 +1949,24 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, struct dmar_domain *domain; int ret; - printk(KERN_INFO - "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", - pci_name(pdev), start, end); - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; + /* For _hardware_ passthrough, don't bother. But for software + passthrough, we do it anyway -- it may indicate a memory + range which is reserved in E820, so which didn't get set + up to start with in si_domain */ + if (domain == si_domain && hw_pass_through) { + printk("Ignoring identity map for HW passthrough device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + return 0; + } + + printk(KERN_INFO + "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", + pci_name(pdev), start, end); + ret = iommu_domain_identity_map(domain, start, end); if (ret) goto error; @@ -2006,23 +2017,6 @@ static inline void iommu_prepare_isa(void) } #endif /* !CONFIG_DMAR_FLPY_WA */ -/* Initialize each context entry as pass through.*/ -static int __init init_context_pass_through(void) -{ - struct pci_dev *pdev = NULL; - struct dmar_domain *domain; - int ret; - - for_each_pci_dev(pdev) { - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); - ret = domain_context_mapping(domain, pdev, - CONTEXT_TT_PASS_THROUGH); - if (ret) - return ret; - } - return 0; -} - static int md_domain_init(struct dmar_domain *domain, int guest_width); static int __init si_domain_work_fn(unsigned long start_pfn, @@ -2037,7 +2031,7 @@ static int __init si_domain_work_fn(unsigned long start_pfn, } -static int si_domain_init(void) +static int si_domain_init(int hw) { struct dmar_drhd_unit *drhd; struct intel_iommu *iommu; @@ -2064,6 +2058,9 @@ static int si_domain_init(void) si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + if (hw) + return 0; + for_each_online_node(nid) { work_with_active_regions(nid, si_domain_work_fn, &ret); if (ret) @@ -2155,24 +2152,26 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup) return 1; } -static int iommu_prepare_static_identity_mapping(void) +static int iommu_prepare_static_identity_mapping(int hw) { struct pci_dev *pdev = NULL; int ret; - ret = si_domain_init(); + ret = si_domain_init(hw); if (ret) return -EFAULT; for_each_pci_dev(pdev) { if (iommu_should_identity_map(pdev, 1)) { - printk(KERN_INFO "IOMMU: identity mapping for device %s\n", - pci_name(pdev)); + printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", + hw ? "hardware" : "software", pci_name(pdev)); ret = domain_context_mapping(si_domain, pdev, + hw ? CONTEXT_TT_PASS_THROUGH : CONTEXT_TT_MULTI_LEVEL); if (ret) return ret; + ret = domain_add_dev_info(si_domain, pdev); if (ret) return ret; @@ -2189,14 +2188,6 @@ int __init init_dmars(void) struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; - int pass_through = 1; - - /* - * In case pass through can not be enabled, iommu tries to use identity - * mapping. - */ - if (iommu_pass_through) - iommu_identity_mapping = 1; /* * for each drhd @@ -2250,14 +2241,8 @@ int __init init_dmars(void) goto error; } if (!ecap_pass_through(iommu->ecap)) - pass_through = 0; + hw_pass_through = 0; } - if (iommu_pass_through) - if (!pass_through) { - printk(KERN_INFO - "Pass Through is not supported by hardware.\n"); - iommu_pass_through = 0; - } /* * Start from the sane iommu hardware state. @@ -2312,64 +2297,57 @@ int __init init_dmars(void) } } + if (iommu_pass_through) + iommu_identity_mapping = 1; +#ifdef CONFIG_DMAR_BROKEN_GFX_WA + else + iommu_identity_mapping = 2; +#endif /* - * If pass through is set and enabled, context entries of all pci - * devices are intialized by pass through translation type. + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa and may fall back to static + * identity mapping if iommu_identity_mapping is set. */ - if (iommu_pass_through) { - ret = init_context_pass_through(); + if (iommu_identity_mapping) { + ret = iommu_prepare_static_identity_mapping(hw_pass_through); if (ret) { - printk(KERN_ERR "IOMMU: Pass through init failed.\n"); - iommu_pass_through = 0; + printk(KERN_CRIT "Failed to setup IOMMU pass-through\n"); + goto error; } } - /* - * If pass through is not set or not enabled, setup context entries for - * identity mappings for rmrr, gfx, and isa and may fall back to static - * identity mapping if iommu_identity_mapping is set. + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor */ - if (!iommu_pass_through) { -#ifdef CONFIG_DMAR_BROKEN_GFX_WA - if (!iommu_identity_mapping) - iommu_identity_mapping = 2; -#endif - if (iommu_identity_mapping) - iommu_prepare_static_identity_mapping(); - /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor - */ - printk(KERN_INFO "IOMMU: Setting RMRR:\n"); - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* - * some BIOS lists non-exist devices in DMAR - * table. - */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR - "IOMMU: mapping reserved region failed\n"); - } + printk(KERN_INFO "IOMMU: Setting RMRR:\n"); + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR + "IOMMU: mapping reserved region failed\n"); } - - iommu_prepare_isa(); } + iommu_prepare_isa(); + /* * for each drhd * enable fault log @@ -2536,7 +2514,10 @@ static int iommu_no_mapping(struct device *dev) ret = domain_add_dev_info(si_domain, pdev); if (ret) return 0; - ret = domain_context_mapping(si_domain, pdev, CONTEXT_TT_MULTI_LEVEL); + ret = domain_context_mapping(si_domain, pdev, + hw_pass_through ? + CONTEXT_TT_PASS_THROUGH : + CONTEXT_TT_MULTI_LEVEL); if (!ret) { printk(KERN_INFO "64bit %s uses identity mapping\n", pci_name(pdev)); @@ -3202,7 +3183,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) + if (no_iommu || swiotlb || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -3222,14 +3203,7 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - - if (!iommu_pass_through) { - printk(KERN_INFO - "Multi-level page-table translation for DMAR.\n"); - dma_ops = &intel_dma_ops; - } else - printk(KERN_INFO - "DMAR: Pass through translation for DMAR.\n"); + dma_ops = &intel_dma_ops; init_iommu_sysfs(); -- cgit v1.2.3 From efafc8b213e67ed148a5b53ade29ee7b48af907d Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Fri, 14 Aug 2009 15:23:29 -0400 Subject: x86: add arch-specific SFI support arch/x86/kernel/sfi.c serves the dual-purpose of supporting the SFI core with arch specific code, as well as a home for the arch-specific code that uses SFI. analogous to ACPI, drivers/sfi/Kconfig is pulled in by arch/x86/Kconfig Signed-off-by: Feng Tang Signed-off-by: Len Brown Cc: x86@kernel.org --- arch/x86/Kconfig | 2 + arch/x86/kernel/Makefile | 1 + arch/x86/kernel/setup.c | 3 ++ arch/x86/kernel/sfi.c | 133 +++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 139 insertions(+) create mode 100644 arch/x86/kernel/sfi.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 13ffa5df37d7..d8ba42418d33 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1683,6 +1683,8 @@ source "kernel/power/Kconfig" source "drivers/acpi/Kconfig" +source "drivers/sfi/Kconfig" + config X86_APM_BOOT bool default y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 430d5b24af7b..6321afaafb26 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -55,6 +55,7 @@ obj-y += step.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += cpu/ obj-y += acpi/ +obj-$(CONFIG_SFI) += sfi.o obj-y += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 63f32d220ef2..d784ea207606 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -990,6 +991,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_init(); + sfi_init(); + #if defined(CONFIG_X86_MPPARSE) || defined(CONFIG_X86_VISWS) /* * get boot-time SMP configuration: diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c new file mode 100644 index 000000000000..761df3f759c1 --- /dev/null +++ b/arch/x86/kernel/sfi.c @@ -0,0 +1,133 @@ +/* + * sfi.c - x86 architecture SFI support. + * + * Copyright (c) 2009, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#define KMSG_COMPONENT "SFI" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_X86_LOCAL_APIC +static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; + +void __init mp_sfi_register_lapic_address(unsigned long address) +{ + mp_lapic_addr = address; + + set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); + + pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid); +} + +/* All CPUs enumerated by SFI must be present and enabled */ +void __cpuinit mp_sfi_register_lapic(u8 id) +{ + int boot_cpu = 0; + + if (MAX_APICS - id <= 0) { + pr_warning("Processor #%d invalid (max %d)\n", + id, MAX_APICS); + return; + } + + if (id == boot_cpu_physical_apicid) + boot_cpu = 1; + pr_info("registering lapic[%d]\n", id); + + generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); +} + +static int __init sfi_parse_cpus(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_cpu_table_entry *pentry; + int i; + int cpu_num; + + sb = (struct sfi_table_simple *)table; + cpu_num = SFI_GET_NUM_ENTRIES(sb, struct sfi_cpu_table_entry); + pentry = (struct sfi_cpu_table_entry *)sb->pentry; + + for (i = 0; i < cpu_num; i++) { + mp_sfi_register_lapic(pentry->apic_id); + pentry++; + } + + smp_found_config = 1; + return 0; +} +#endif /* CONFIG_X86_LOCAL_APIC */ + +#ifdef CONFIG_X86_IO_APIC +static u32 gsi_base; + +static int __init sfi_parse_ioapic(struct sfi_table_header *table) +{ + struct sfi_table_simple *sb; + struct sfi_apic_table_entry *pentry; + int i, num; + + sb = (struct sfi_table_simple *)table; + num = SFI_GET_NUM_ENTRIES(sb, struct sfi_apic_table_entry); + pentry = (struct sfi_apic_table_entry *)sb->pentry; + + for (i = 0; i < num; i++) { + mp_register_ioapic(i, pentry->phys_addr, gsi_base); + gsi_base += io_apic_get_redir_entries(i); + pentry++; + } + + WARN(pic_mode, KERN_WARNING + "SFI: pic_mod shouldn't be 1 when IOAPIC table is present\n"); + pic_mode = 0; + return 0; +} +#endif /* CONFIG_X86_IO_APIC */ + +/* + * sfi_platform_init(): register lapics & io-apics + */ +int __init sfi_platform_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + mp_sfi_register_lapic_address(sfi_lapic_addr); + sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus); +#endif +#ifdef CONFIG_X86_IO_APIC + sfi_table_parse(SFI_SIG_APIC, NULL, NULL, sfi_parse_ioapic); +#endif + return 0; +} -- cgit v1.2.3 From 8079ce34f2c3f5bfedcea8d4fb7290ce46ac5b56 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 31 Aug 2009 16:49:19 +0200 Subject: sfi: Remove unused code Signed-off-by: Thomas Gleixner Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/kernel/sfi.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index 761df3f759c1..b294b7a0d4b9 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c @@ -55,16 +55,12 @@ void __init mp_sfi_register_lapic_address(unsigned long address) /* All CPUs enumerated by SFI must be present and enabled */ void __cpuinit mp_sfi_register_lapic(u8 id) { - int boot_cpu = 0; - if (MAX_APICS - id <= 0) { pr_warning("Processor #%d invalid (max %d)\n", id, MAX_APICS); return; } - if (id == boot_cpu_physical_apicid) - boot_cpu = 1; pr_info("registering lapic[%d]\n", id); generic_processor_info(id, GET_APIC_VERSION(apic_read(APIC_LVR))); -- cgit v1.2.3 From 3834f47291df475be3f0f0fb7ccaa098967cc054 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Mon, 14 Sep 2009 13:01:53 +0800 Subject: SFI: remove unneeded includes Signed-off-by: Feng Tang Signed-off-by: Len Brown --- arch/x86/kernel/sfi.c | 7 ------- drivers/sfi/sfi_core.c | 5 ----- 2 files changed, 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index b294b7a0d4b9..34e099382651 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c @@ -21,22 +21,15 @@ #define KMSG_COMPONENT "SFI" #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt -#include -#include -#include #include #include -#include -#include #include #include #include -#include #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; diff --git a/drivers/sfi/sfi_core.c b/drivers/sfi/sfi_core.c index 7bd6b18fdd27..d3b496800477 100644 --- a/drivers/sfi/sfi_core.c +++ b/drivers/sfi/sfi_core.c @@ -62,16 +62,11 @@ #include #include #include -#include #include #include #include #include -#include #include -#include - -#include #include "sfi_core.h" -- cgit v1.2.3 From c2777f98c205148f1a0d4f9ac03b9cb20b39b2da Mon Sep 17 00:00:00 2001 From: Daniel Walker Date: Sat, 12 Sep 2009 10:40:20 -0700 Subject: x86: apic: Convert BUG() to BUG_ON() This was done using Coccinelle's BUG_ON semantic patch. Signed-off-by: Daniel Walker Cc: Julia Lawall LKML-Reference: <1252777220-30796-1-git-send-email-dwalker@fifo99.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 159740decc41..79e5b92a5800 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1196,8 +1196,7 @@ void __cpuinit setup_local_APIC(void) * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. */ - if (!apic->apic_id_registered()) - BUG(); + BUG_ON(!apic->apic_id_registered()); /* * Intel recommends to set DFR, LDR and TPR before enabling -- cgit v1.2.3 From daf7b9c9216e2b82e4c14b7248a85286dab021c3 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Wed, 9 Sep 2009 10:43:39 -0500 Subject: x86: SGI UV: Map MMIO-High memory range UV depends on the MMRHI space being identity mapped. The patch: x86: Make 64-bit efi_ioremap use ioremap on MMIO regions changed this to make efi regions at a different address using ioremap. Add the identity mapping to uv_system_init. ( Note this code was previously present but was deleted when BIOS added the ranges to the EFI map - previous efi code identify mapped the ranges. ) Signed-off-by: Jack Steiner LKML-Reference: <20090909154339.GA7946@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 601159374e87..f5f5886a6b53 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -389,6 +389,16 @@ static __init void map_gru_high(int max_pnode) map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } +static __init void map_mmr_high(int max_pnode) +{ + union uvh_rh_gam_mmr_overlay_config_mmr_u mmr; + int shift = UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT; + + mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); + if (mmr.s.enable) + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); +} + static __init void map_mmioh_high(int max_pnode) { union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; @@ -643,6 +653,7 @@ void __init uv_system_init(void) } map_gru_high(max_pnode); + map_mmr_high(max_pnode); map_mmioh_high(max_pnode); uv_cpu_init(); -- cgit v1.2.3 From d223246ef7e6d73c8e3d9b58f27f2eb3fe95e25d Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Wed, 16 Sep 2009 16:44:26 -0400 Subject: x86: fix fragile computation of vsyscall address Previously, the address of the vsyscall page (VSYSCALL_PHYS_ADDR, VSYSCALL_VIRT_ADDR) was computed by arithmetic on the address of the last section. This leads to bugs when new sections are inserted, such as the one fixed by commit d312ceda567ab91acd756cde95ac5fbc6b40ed40. Let's compute it from the current address instead. Signed-off-by: Anders Kaseorg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0ccb57d5ee35..b96ca472fa26 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -135,24 +135,21 @@ SECTIONS #ifdef CONFIG_X86_64 #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data) + SIZEOF(.data) + \ - PAGE_SIZE - 1) & ~(PAGE_SIZE - 1)) -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD_OFFSET (VSYSCALL_ADDR - __vsyscall_0 + LOAD_OFFSET) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + . = ALIGN(4096); + __vsyscall_0 = .; + . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + .vsyscall_0 : AT(VLOAD(.vsyscall_0)) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) @@ -192,11 +189,9 @@ SECTIONS *(.vsyscall_3) } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR #undef VLOAD_OFFSET #undef VLOAD #undef VVIRT_OFFSET -- cgit v1.2.3 From 4ae59b916d269255800d69a07ac5b0c368a417f8 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Wed, 16 Sep 2009 16:44:28 -0400 Subject: x86: convert to use __HEAD and HEAD_TEXT macros. This has the consequence of changing the section name use for head code from ".text.head" to ".head.text". It also eliminates the ".text.head" output section (instead placing head code at the start of the .text output section), which should be harmless. This patch only changes the sections in the actual kernel, not those in the compressed boot loader. Signed-off-by: Tim Abbott Cc: Ingo Molnar Cc: Sam Ravnborg Signed-off-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/vmlinux.lds.S | 12 +++--------- 3 files changed, 5 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 7ffec6b3b331..64ad7696786f 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -79,7 +79,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) * any particular GDT layout, because we load our own as soon as we * can. */ -.section .text.head,"ax",@progbits +__HEAD ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a05..26406601031c 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -40,7 +40,7 @@ L4_START_KERNEL = pgd_index(__START_KERNEL_map) L3_START_KERNEL = pud_index(__START_KERNEL_map) .text - .section .text.head + __HEAD .code64 .globl startup_64 startup_64: diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b96ca472fa26..5c7826dd804b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -65,17 +65,11 @@ SECTIONS #endif /* Text and read-only data */ - - /* bootstrapping code */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* The rest of the text */ .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* bootstrapping code */ + HEAD_TEXT #ifdef CONFIG_X86_32 - /* not really needed, already page aligned */ . = ALIGN(PAGE_SIZE); *(.text.page_aligned) #endif -- cgit v1.2.3 From 07e81d61605f885920f31634a65aace52beb97db Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Wed, 16 Sep 2009 16:44:29 -0400 Subject: x86: Use section .data.page_aligned for the idt_table. The .data.idt section is just squashed into the .data.page_aligned output section by the linker script anyway, so it might as well be in the .data.page_aligned section. This eliminates all references to .data.idt on x86. Signed-off-by: Tim Abbott Cc: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/kernel/traps.c | 6 ++---- arch/x86/kernel/vmlinux.lds.S | 1 - 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 83264922a878..ea23d3b76f95 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -73,11 +73,9 @@ char ignore_fpu_irq; /* * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. + * F0 0F bug workaround. */ -gate_desc idt_table[NR_VECTORS] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +gate_desc idt_table[NR_VECTORS] __page_aligned_data = { { { { 0, 0 } } }, }; #endif DECLARE_BITMAP(used_vectors, NR_VECTORS); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 5c7826dd804b..7d6cef363c47 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -112,7 +112,6 @@ SECTIONS #endif PAGE_ALIGNED_DATA(PAGE_SIZE) - *(.data.idt) CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) -- cgit v1.2.3 From 123f3e1d76952759c0fc8b4fec55a3fc8084d56f Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Wed, 16 Sep 2009 16:44:30 -0400 Subject: x86: Cleanup linker script using new linker script macros. Signed-off-by: Tim Abbott Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vmlinux.lds.S | 47 +++---------------------------------------- 1 file changed, 3 insertions(+), 44 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 7d6cef363c47..8d2d0a271e23 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -88,13 +88,7 @@ SECTIONS NOTES :text :note - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + EXCEPTION_TABLE(16) :text = 0x9090 RO_DATA(PAGE_SIZE) @@ -207,36 +201,12 @@ SECTIONS PERCPU_VADDR(0, :percpu) #endif - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } + INIT_TEXT_SECTION(PAGE_SIZE) #ifdef CONFIG_X86_64 :init #endif - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } + INIT_DATA_SECTION(16) .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { __x86_cpu_dev_start = .; @@ -244,8 +214,6 @@ SECTIONS __x86_cpu_dev_end = .; } - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; @@ -276,15 +244,6 @@ SECTIONS EXIT_DATA } -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) PERCPU(PAGE_SIZE) #endif -- cgit v1.2.3 From 5622f295b53fb60dbf9bed3e2c89d182490a8b7f Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 15 Sep 2009 13:00:23 +0200 Subject: x86, perf_counter, bts: Optimize BTS overflow handling Draining the BTS buffer on a buffer overflow interrupt takes too long resulting in a kernel lockup when tracing the kernel. Restructure perf_counter sampling into sample creation and sample output. Prepare a single reference sample for BTS sampling and update the from and to address fields when draining the BTS buffer. Drain the entire BTS buffer between a single perf_output_begin() / perf_output_end() pair. Signed-off-by: Markus Metzger Signed-off-by: Peter Zijlstra LKML-Reference: <20090915130023.A16204@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 60 ++++--- include/linux/perf_counter.h | 68 +++++++- kernel/perf_counter.c | 312 ++++++++++++++++++++----------------- 3 files changed, 266 insertions(+), 174 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f9cd0849bd42..6a0e71b38126 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -36,10 +36,10 @@ static u64 perf_counter_mask __read_mostly; #define BTS_RECORD_SIZE 24 /* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 1024) +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) /* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 64) +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) /* @@ -1488,8 +1488,7 @@ void perf_counter_print_debug(void) local_irq_restore(flags); } -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, - struct perf_sample_data *data) +static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) { struct debug_store *ds = cpuc->ds; struct bts_record { @@ -1498,8 +1497,11 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, u64 flags; }; struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - unsigned long orig_ip = data->regs->ip; struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; if (!counter) return; @@ -1510,19 +1512,38 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc, at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; top = (struct bts_record *)(unsigned long)ds->bts_index; + if (top <= at) + return; + ds->bts_index = ds->bts_buffer_base; + + data.period = counter->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, counter, ®s); + + if (perf_output_begin(&handle, counter, + header.size * (top - at), 1, 1)) + return; + for (; at < top; at++) { - data->regs->ip = at->from; - data->addr = at->to; + data.ip = at->from; + data.addr = at->to; - perf_counter_output(counter, 1, data); + perf_output_sample(&handle, &header, &data, counter); } - data->regs->ip = orig_ip; - data->addr = 0; + perf_output_end(&handle); /* There's new data available. */ + counter->hw.interrupts++; counter->pending_kill = POLL_IN; } @@ -1552,13 +1573,9 @@ static void x86_pmu_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - struct perf_sample_data data; - struct pt_regs regs; + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); - data.regs = ®s; - intel_pmu_drain_bts_buffer(cpuc, &data); - } cpuc->counters[idx] = NULL; clear_bit(idx, cpuc->used_mask); @@ -1619,7 +1636,6 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1644,7 +1660,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) + if (perf_counter_overflow(counter, 1, &data, regs)) p6_pmu_disable_counter(hwc, idx); } @@ -1665,13 +1681,12 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.regs = regs; data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_counters); perf_disable(); - intel_pmu_drain_bts_buffer(cpuc, &data); + intel_pmu_drain_bts_buffer(cpuc); status = intel_pmu_get_status(); if (!status) { perf_enable(); @@ -1702,7 +1717,7 @@ again: data.period = counter->hw.last_period; - if (perf_counter_overflow(counter, 1, &data)) + if (perf_counter_overflow(counter, 1, &data, regs)) intel_pmu_disable_counter(&counter->hw, bit); } @@ -1729,7 +1744,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.regs = regs; data.addr = 0; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -1754,7 +1768,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) if (!x86_perf_counter_set_period(counter, hwc, idx)) continue; - if (perf_counter_overflow(counter, 1, &data)) + if (perf_counter_overflow(counter, 1, &data, regs)) amd_pmu_disable_counter(hwc, idx); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6c1ef72ea501..c7375f97aa19 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -691,6 +691,17 @@ struct perf_cpu_context { int recursion[4]; }; +struct perf_output_handle { + struct perf_counter *counter; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; +}; + #ifdef CONFIG_PERF_COUNTERS /* @@ -722,16 +733,38 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); struct perf_sample_data { - struct pt_regs *regs; + u64 type; + + u64 ip; + struct { + u32 pid; + u32 tid; + } tid_entry; + u64 time; u64 addr; + u64 id; + u64 stream_id; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; u64 period; + struct perf_callchain_entry *callchain; struct perf_raw_record *raw; }; +extern void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter); +extern void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter, + struct pt_regs *regs); + extern int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data); -extern void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data); + struct perf_sample_data *data, + struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter @@ -781,6 +814,12 @@ extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, #define perf_instruction_pointer(regs) instruction_pointer(regs) #endif +extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size, + int nmi, int sample); +extern void perf_output_end(struct perf_output_handle *handle); +extern void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -807,7 +846,28 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma) { } static inline void perf_counter_comm(struct task_struct *tsk) { } static inline void perf_counter_fork(struct task_struct *tsk) { } static inline void perf_counter_init(void) { } + +static inline int +perf_output_begin(struct perf_output_handle *handle, struct perf_counter *c, + unsigned int size, int nmi, int sample) { } +static inline void perf_output_end(struct perf_output_handle *handle) { } +static inline void +perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) { } +static inline void +perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter) { } +static inline void +perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter, + struct pt_regs *regs) { } #endif +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + #endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 29b73b6e8146..215845243a69 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2512,18 +2512,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) /* * Output */ - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, unsigned long offset, unsigned long head) { @@ -2633,8 +2621,8 @@ out: local_irq_restore(handle->flags); } -static void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) { unsigned int pages_mask; unsigned int offset; @@ -2669,12 +2657,9 @@ static void perf_output_copy(struct perf_output_handle *handle, WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) +int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size, + int nmi, int sample) { struct perf_counter *output_counter; struct perf_mmap_data *data; @@ -2756,7 +2741,7 @@ out: return -ENOSPC; } -static void perf_output_end(struct perf_output_handle *handle) +void perf_output_end(struct perf_output_handle *handle) { struct perf_counter *counter = handle->counter; struct perf_mmap_data *data = handle->data; @@ -2870,82 +2855,151 @@ static void perf_output_read(struct perf_output_handle *handle, perf_output_read_one(handle, counter); } -void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, counter); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_counter *counter, + struct pt_regs *regs) { - int ret; u64 sample_type = counter->attr.sample_type; - struct perf_output_handle handle; - struct perf_event_header header; - u64 ip; - struct { - u32 pid, tid; - } tid_entry; - struct perf_callchain_entry *callchain = NULL; - int callchain_size = 0; - u64 time; - struct { - u32 cpu, reserved; - } cpu_entry; - header.type = PERF_EVENT_SAMPLE; - header.size = sizeof(header); + data->type = sample_type; - header.misc = 0; - header.misc |= perf_misc_flags(data->regs); + header->type = PERF_EVENT_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); if (sample_type & PERF_SAMPLE_IP) { - ip = perf_instruction_pointer(data->regs); - header.size += sizeof(ip); + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); } if (sample_type & PERF_SAMPLE_TID) { /* namespace issues */ - tid_entry.pid = perf_counter_pid(counter, current); - tid_entry.tid = perf_counter_tid(counter, current); + data->tid_entry.pid = perf_counter_pid(counter, current); + data->tid_entry.tid = perf_counter_tid(counter, current); - header.size += sizeof(tid_entry); + header->size += sizeof(data->tid_entry); } if (sample_type & PERF_SAMPLE_TIME) { /* * Maybe do better on x86 and provide cpu_clock_nmi() */ - time = sched_clock(); + data->time = sched_clock(); - header.size += sizeof(u64); + header->size += sizeof(data->time); } if (sample_type & PERF_SAMPLE_ADDR) - header.size += sizeof(u64); + header->size += sizeof(data->addr); - if (sample_type & PERF_SAMPLE_ID) - header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_counter_id(counter); - if (sample_type & PERF_SAMPLE_STREAM_ID) - header.size += sizeof(u64); + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = counter->id; + + header->size += sizeof(data->stream_id); + } if (sample_type & PERF_SAMPLE_CPU) { - header.size += sizeof(cpu_entry); + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; - cpu_entry.cpu = raw_smp_processor_id(); - cpu_entry.reserved = 0; + header->size += sizeof(data->cpu_entry); } if (sample_type & PERF_SAMPLE_PERIOD) - header.size += sizeof(u64); + header->size += sizeof(data->period); if (sample_type & PERF_SAMPLE_READ) - header.size += perf_counter_read_size(counter); + header->size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { - callchain = perf_callchain(data->regs); + int size = 1; - if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); - header.size += callchain_size; - } else - header.size += sizeof(u64); + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); } if (sample_type & PERF_SAMPLE_RAW) { @@ -2957,69 +3011,23 @@ void perf_counter_output(struct perf_counter *counter, int nmi, size += sizeof(u32); WARN_ON_ONCE(size & (sizeof(u64)-1)); - header.size += size; - } - - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); - if (ret) - return; - - perf_output_put(&handle, header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(&handle, ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(&handle, tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(&handle, time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(&handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - u64 id = primary_counter_id(counter); - - perf_output_put(&handle, id); + header->size += size; } +} - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(&handle, counter->id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(&handle, cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(&handle, data->period); +static void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(&handle, counter); + perf_prepare_sample(&header, data, counter, regs); - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (callchain) - perf_output_copy(&handle, callchain, callchain_size); - else { - u64 nr = 0; - perf_output_put(&handle, nr); - } - } + if (perf_output_begin(&handle, counter, header.size, nmi, 1)) + return; - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(&handle, data->raw->size); - perf_output_copy(&handle, data->raw->data, data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(&handle, raw); - } - } + perf_output_sample(&handle, &header, data, counter); perf_output_end(&handle); } @@ -3501,7 +3509,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) */ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, - int throttle, struct perf_sample_data *data) + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) { int events = atomic_read(&counter->event_limit); struct hw_perf_counter *hwc = &counter->hw; @@ -3557,14 +3566,15 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, data); + perf_counter_output(counter, nmi, data, regs); return ret; } int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { - return __perf_counter_overflow(counter, nmi, 1, data); + return __perf_counter_overflow(counter, nmi, 1, data, regs); } /* @@ -3602,7 +3612,8 @@ again: } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data) + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) { struct hw_perf_counter *hwc = &counter->hw; int throttle = 0; @@ -3615,7 +3626,8 @@ static void perf_swcounter_overflow(struct perf_counter *counter, return; for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, data)) { + if (__perf_counter_overflow(counter, nmi, throttle, + data, regs)) { /* * We inhibit the overflow from happening when * hwc->interrupts == MAX_INTERRUPTS. @@ -3634,7 +3646,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter) } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) { struct hw_perf_counter *hwc = &counter->hw; @@ -3643,11 +3656,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, if (!hwc->sample_period) return; - if (!data->regs) + if (!regs) return; if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data); + perf_swcounter_overflow(counter, nmi, data, regs); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3706,7 +3719,8 @@ static int perf_swcounter_match(struct perf_counter *counter, static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_counter *counter; @@ -3715,8 +3729,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, data->regs)) - perf_swcounter_add(counter, nr, nmi, data); + if (perf_swcounter_match(counter, type, event, regs)) + perf_swcounter_add(counter, nr, nmi, data, regs); } rcu_read_unlock(); } @@ -3737,7 +3751,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) static void do_perf_swcounter_event(enum perf_type_id type, u32 event, u64 nr, int nmi, - struct perf_sample_data *data) + struct perf_sample_data *data, + struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -3750,7 +3765,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event, barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data); + nr, nmi, data, regs); rcu_read_lock(); /* * doesn't really matter which of the child contexts the @@ -3758,7 +3773,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event, */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data); + perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); rcu_read_unlock(); barrier(); @@ -3772,11 +3787,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { struct perf_sample_data data = { - .regs = regs, .addr = addr, }; - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data); + do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, + &data, regs); } static void perf_swcounter_read(struct perf_counter *counter) @@ -3813,6 +3828,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_sample_data data; + struct pt_regs *regs; struct perf_counter *counter; u64 period; @@ -3820,17 +3836,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) counter->pmu->read(counter); data.addr = 0; - data.regs = get_irq_regs(); + regs = get_irq_regs(); /* * In case we exclude kernel IPs or are somehow not in interrupt * context, provide the next best thing, the user IP. */ - if ((counter->attr.exclude_kernel || !data.regs) && + if ((counter->attr.exclude_kernel || !regs) && !counter->attr.exclude_user) - data.regs = task_pt_regs(current); + regs = task_pt_regs(current); - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) + if (regs) { + if (perf_counter_overflow(counter, 0, &data, regs)) ret = HRTIMER_NORESTART; } @@ -3966,15 +3982,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, }; struct perf_sample_data data = { - .regs = get_irq_regs(), .addr = addr, .raw = &raw, }; - if (!data.regs) - data.regs = task_pt_regs(current); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3 From 2fbd07a5f5d1295fa9b0c0564ec27da7c276a75a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 18 Sep 2009 19:29:59 -0700 Subject: x86, apic: Use logical flat on intel with <= 8 logical cpus On Intel platforms, we can use logical flat mode if there are <= 8 logical cpu's (irrespective of physical apic id values). This will enable simplified and efficient IPI and device interrupt routing on such platforms. Fix the relevant comments while we are at it. We can clean up default_setup_apic_routing() by using apic->probe() but that is a different item. Signed-off-by: Suresh Siddha Cc: "yinghai@kernel.org" LKML-Reference: <1253327399.3948.747.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 26 ++++++++------------------ arch/x86/kernel/apic/probe_64.c | 15 +++++++++++---- 2 files changed, 19 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 79e5b92a5800..072aea6c630b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -61,7 +61,7 @@ unsigned int boot_cpu_physical_apicid = -1U; /* * The highest APIC ID seen during enumeration. * - * This determines the messaging protocol we can use: if all APIC IDs + * On AMD, this determines the messaging protocol we can use: if all APIC IDs * are in the 0 ... 7 range, then we can use logical addressing which * has some performance advantages (better broadcasting). * @@ -1915,24 +1915,14 @@ void __cpuinit generic_processor_info(int apicid, int version) max_physical_apicid = apicid; #ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + def_to_bigsmp = 1; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) def_to_bigsmp = 1; - } } #endif diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index 65edc180fc82..c4cbd3080c1c 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -64,16 +64,23 @@ void __init default_setup_apic_routing(void) apic = &apic_x2apic_phys; else apic = &apic_x2apic_cluster; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); } #endif if (apic == &apic_flat) { - if (max_physical_apicid >= 8) - apic = &apic_physflat; - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (num_processors > 8) + apic = &apic_physflat; + break; + case X86_VENDOR_AMD: + if (max_physical_apicid >= 8) + apic = &apic_physflat; + } } + printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); + if (is_vsmp_box()) { /* need to update phys_pkg_id */ apic->phys_pkg_id = apicid_phys_pkg_id; -- cgit v1.2.3 From 6161352142d5fed4cd753b32e5ccde66e705b14e Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 17 Sep 2009 16:11:28 +0200 Subject: tracing, perf: Convert the power tracer into an event tracer This patch converts the existing power tracer into an event tracer, so that power events (C states and frequency changes) can be tracked via "perf". This also removes the perl script that was used to demo the tracer; its functionality is being replaced entirely with timechart. Signed-off-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: <20090912130542.6d314860@infradead.org> Signed-off-by: Ingo Molnar --- Documentation/trace/power.txt | 17 --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 7 +- arch/x86/kernel/process.c | 28 ++-- include/trace/events/power.h | 81 +++++++++++ kernel/trace/Makefile | 2 +- kernel/trace/power-traces.c | 20 +++ kernel/trace/trace.h | 3 - kernel/trace/trace_entries.h | 17 --- kernel/trace/trace_power.c | 218 ----------------------------- scripts/tracing/power.pl | 108 -------------- 10 files changed, 113 insertions(+), 388 deletions(-) delete mode 100644 Documentation/trace/power.txt create mode 100644 include/trace/events/power.h create mode 100644 kernel/trace/power-traces.c delete mode 100644 kernel/trace/trace_power.c delete mode 100644 scripts/tracing/power.pl (limited to 'arch/x86/kernel') diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt deleted file mode 100644 index cd805e16dc27..000000000000 --- a/Documentation/trace/power.txt +++ /dev/null @@ -1,17 +0,0 @@ -The power tracer collects detailed information about C-state and P-state -transitions, instead of just looking at the high-level "average" -information. - -There is a helper script found in scrips/tracing/power.pl in the kernel -sources which can be used to parse this information and create a -Scalable Vector Graphics (SVG) picture from the trace data. - -To use this tracer: - - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - echo power > /sys/kernel/debug/tracing/current_tracer - echo 1 > /sys/kernel/debug/tracing/tracing_enabled - sleep 1 - echo 0 > /sys/kernel/debug/tracing/tracing_enabled - cat /sys/kernel/debug/tracing/trace | \ - perl scripts/tracing/power.pl > out.sv diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 4109679863c1..479cc8c418c1 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include @@ -72,8 +72,6 @@ static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); static DEFINE_PER_CPU(struct aperfmperf, old_perf); -DEFINE_TRACE(power_mark); - /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -332,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, unsigned int next_perf_state = 0; /* Index into perf table */ unsigned int i; int result = 0; - struct power_trace it; dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu); @@ -364,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_mark(&it, POWER_PSTATE, next_perf_state); + trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 071166a4ba83..7b60e3906889 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -DEFINE_TRACE(power_start); -DEFINE_TRACE(power_end); - int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { *dst = *src; @@ -299,9 +296,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -314,7 +309,7 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(&it); + trace_power_end(0); } else { local_irq_enable(); /* loop is done by the caller */ @@ -372,9 +367,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -384,15 +377,14 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(&it); + trace_power_end(0); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - struct power_trace it; if (!need_resched()) { - trace_power_start(&it, POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -402,7 +394,7 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(&it); + trace_power_end(0); } else local_irq_enable(); } @@ -414,13 +406,11 @@ static void mwait_idle(void) */ static void poll_idle(void) { - struct power_trace it; - - trace_power_start(&it, POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0); local_irq_enable(); while (!need_resched()) cpu_relax(); - trace_power_end(&it); + trace_power_end(0); } /* diff --git a/include/trace/events/power.h b/include/trace/events/power.h new file mode 100644 index 000000000000..ea6d579261ad --- /dev/null +++ b/include/trace/events/power.h @@ -0,0 +1,81 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM power + +#if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_POWER_H + +#include +#include + +#ifndef _TRACE_POWER_ENUM_ +#define _TRACE_POWER_ENUM_ +enum { + POWER_NONE = 0, + POWER_CSTATE = 1, + POWER_PSTATE = 2, +}; +#endif + + + +TRACE_EVENT(power_start, + + TP_PROTO(unsigned int type, unsigned int state), + + TP_ARGS(type, state), + + TP_STRUCT__entry( + __field( u64, type ) + __field( u64, state ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->state = state; + ), + + TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state) +); + +TRACE_EVENT(power_end, + + TP_PROTO(int dummy), + + TP_ARGS(dummy), + + TP_STRUCT__entry( + __field( u64, dummy ) + ), + + TP_fast_assign( + __entry->dummy = 0xffff; + ), + + TP_printk("dummy=%lu", (unsigned long)__entry->dummy) + +); + + +TRACE_EVENT(power_frequency, + + TP_PROTO(unsigned int type, unsigned int state), + + TP_ARGS(type, state), + + TP_STRUCT__entry( + __field( u64, type ) + __field( u64, state ) + ), + + TP_fast_assign( + __entry->type = type; + __entry->state = state; + ), + + TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long) __entry->state) +); + +#endif /* _TRACE_POWER_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 844164dca90a..26f03ac07c2b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o -obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o @@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += power-traces.o libftrace-y := ftrace.o diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c new file mode 100644 index 000000000000..e06c6e3d56a3 --- /dev/null +++ b/kernel/trace/power-traces.c @@ -0,0 +1,20 @@ +/* + * Power trace points + * + * Copyright (C) 2009 Arjan van de Ven + */ + +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +EXPORT_TRACEPOINT_SYMBOL_GPL(power_start); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_end); +EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency); + diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 86bcff94791a..405cb850b75d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -37,7 +36,6 @@ enum trace_type { TRACE_HW_BRANCHES, TRACE_KMEM_ALLOC, TRACE_KMEM_FREE, - TRACE_POWER, TRACE_BLK, __TRACE_LAST_TYPE, @@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\ - IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \ IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry, \ TRACE_KMEM_ALLOC); \ IF_ASSIGN(var, ent, struct kmemtrace_free_entry, \ diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index a431748ddd6e..ead3d724599d 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry, F_printk("from: %llx to: %llx", __entry->from, __entry->to) ); -FTRACE_ENTRY(power, trace_power, - - TRACE_POWER, - - F_STRUCT( - __field_struct( struct power_trace, state_data ) - __field_desc( s64, state_data, stamp ) - __field_desc( s64, state_data, end ) - __field_desc( int, state_data, type ) - __field_desc( int, state_data, state ) - ), - - F_printk("%llx->%llx type:%u state:%u", - __entry->stamp, __entry->end, - __entry->type, __entry->state) -); - FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, TRACE_KMEM_ALLOC, diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c deleted file mode 100644 index fe1a00f1445a..000000000000 --- a/kernel/trace/trace_power.c +++ /dev/null @@ -1,218 +0,0 @@ -/* - * ring buffer based C-state tracer - * - * Arjan van de Ven - * Copyright (C) 2008 Intel Corporation - * - * Much is borrowed from trace_boot.c which is - * Copyright (C) 2008 Frederic Weisbecker - * - */ - -#include -#include -#include -#include -#include - -#include "trace.h" -#include "trace_output.h" - -static struct trace_array *power_trace; -static int __read_mostly trace_power_enabled; - -static void probe_power_start(struct power_trace *it, unsigned int type, - unsigned int level) -{ - if (!trace_power_enabled) - return; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); -} - - -static void probe_power_end(struct power_trace *it) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - preempt_disable(); - it->end = ktime_get(); - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static void probe_power_mark(struct power_trace *it, unsigned int type, - unsigned int level) -{ - struct ftrace_event_call *call = &event_power; - struct ring_buffer_event *event; - struct ring_buffer *buffer; - struct trace_power *entry; - struct trace_array_cpu *data; - struct trace_array *tr = power_trace; - - if (!trace_power_enabled) - return; - - buffer = tr->buffer; - - memset(it, 0, sizeof(struct power_trace)); - it->state = level; - it->type = type; - it->stamp = ktime_get(); - preempt_disable(); - it->end = it->stamp; - data = tr->data[smp_processor_id()]; - - event = trace_buffer_lock_reserve(buffer, TRACE_POWER, - sizeof(*entry), 0, 0); - if (!event) - goto out; - entry = ring_buffer_event_data(event); - entry->state_data = *it; - if (!filter_check_discard(call, entry, buffer, event)) - trace_buffer_unlock_commit(buffer, event, 0, 0); - out: - preempt_enable(); -} - -static int tracing_power_register(void) -{ - int ret; - - ret = register_trace_power_start(probe_power_start); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_start\n"); - return ret; - } - ret = register_trace_power_end(probe_power_end); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_end\n"); - goto fail_start; - } - ret = register_trace_power_mark(probe_power_mark); - if (ret) { - pr_info("power trace: Couldn't activate tracepoint" - " probe to trace_power_mark\n"); - goto fail_end; - } - return ret; -fail_end: - unregister_trace_power_end(probe_power_end); -fail_start: - unregister_trace_power_start(probe_power_start); - return ret; -} - -static void start_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 1; -} - -static void stop_power_trace(struct trace_array *tr) -{ - trace_power_enabled = 0; -} - -static void power_trace_reset(struct trace_array *tr) -{ - trace_power_enabled = 0; - unregister_trace_power_start(probe_power_start); - unregister_trace_power_end(probe_power_end); - unregister_trace_power_mark(probe_power_mark); -} - - -static int power_trace_init(struct trace_array *tr) -{ - power_trace = tr; - - trace_power_enabled = 1; - tracing_power_register(); - - tracing_reset_online_cpus(tr); - return 0; -} - -static enum print_line_t power_print_line(struct trace_iterator *iter) -{ - int ret = 0; - struct trace_entry *entry = iter->ent; - struct trace_power *field ; - struct power_trace *it; - struct trace_seq *s = &iter->seq; - struct timespec stamp; - struct timespec duration; - - trace_assign_type(field, entry); - it = &field->state_data; - stamp = ktime_to_timespec(it->stamp); - duration = ktime_to_timespec(ktime_sub(it->end, it->stamp)); - - if (entry->type == TRACE_POWER) { - if (it->type == POWER_CSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu, - duration.tv_sec, - duration.tv_nsec); - if (it->type == POWER_PSTATE) - ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n", - stamp.tv_sec, - stamp.tv_nsec, - it->state, iter->cpu); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; - } - return TRACE_TYPE_UNHANDLED; -} - -static void power_print_header(struct seq_file *s) -{ - seq_puts(s, "# TIMESTAMP STATE EVENT\n"); - seq_puts(s, "# | | |\n"); -} - -static struct tracer power_tracer __read_mostly = -{ - .name = "power", - .init = power_trace_init, - .start = start_power_trace, - .stop = stop_power_trace, - .reset = power_trace_reset, - .print_line = power_print_line, - .print_header = power_print_header, -}; - -static int init_power_trace(void) -{ - return register_tracer(&power_tracer); -} -device_initcall(init_power_trace); diff --git a/scripts/tracing/power.pl b/scripts/tracing/power.pl deleted file mode 100644 index 4f729b3501e0..000000000000 --- a/scripts/tracing/power.pl +++ /dev/null @@ -1,108 +0,0 @@ -#!/usr/bin/perl - -# Copyright 2008, Intel Corporation -# -# This file is part of the Linux kernel -# -# This program file is free software; you can redistribute it and/or modify it -# under the terms of the GNU General Public License as published by the -# Free Software Foundation; version 2 of the License. -# -# This program is distributed in the hope that it will be useful, but WITHOUT -# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License -# for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program in a file named COPYING; if not, write to the -# Free Software Foundation, Inc., -# 51 Franklin Street, Fifth Floor, -# Boston, MA 02110-1301 USA -# -# Authors: -# Arjan van de Ven - - -# -# This script turns a cstate ftrace output into a SVG graphic that shows -# historic C-state information -# -# -# cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg -# - -my @styles; -my $base = 0; - -my @pstate_last; -my @pstate_level; - -$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; -$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)"; - - -print " \n"; -print "\n"; - -my $scale = 30000.0; -while (<>) { - my $line = $_; - if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) { - if ($base == 0) { - $base = $1; - } - my $time = $1 - $base; - $time = $time * $scale; - my $C = $2; - my $cpu = $3; - my $y = 400 * $cpu; - my $duration = $4 * $scale; - my $msec = int($4 * 100000)/100.0; - my $height = $C * 20; - $style = $styles[$C]; - - $y = $y + 140 - $height; - - $x2 = $time + 4; - $y2 = $y + 4; - - - print "\n"; - print "C$C $msec\n"; - } - if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) { - my $time = $1 - $base; - my $state = $2; - my $cpu = $3; - - if (defined($pstate_last[$cpu])) { - my $from = $pstate_last[$cpu]; - my $oldstate = $pstate_state[$cpu]; - my $duration = ($time-$from) * $scale; - - $from = $from * $scale; - my $to = $from + $duration; - my $height = 140 - ($oldstate * (140/8)); - - my $y = 400 * $cpu + 200 + $height; - my $y2 = $y+4; - my $style = $styles[8]; - - print "\n"; - print "P$oldstate (cpu $cpu)\n"; - }; - - $pstate_last[$cpu] = $time; - $pstate_state[$cpu] = $state; - } -} - - -print "\n"; -- cgit v1.2.3 From 288f023e708efd89d77ce9acf977a33a623ae83d Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 19 Sep 2009 13:35:33 +0200 Subject: tracing, x86, cpuidle: Move the end point of a C state in the power tracer The "end of a C state" trace point currently happens before the code runs that corrects the TSC for having stopped during idle. The result of this is that the timestamp of the end-of-C-state event is garbage on cpus where the TSC stops during idle. This patch moves the end point of the C state to after the timekeeping engine of the kernel has been corrected. Signed-off-by: Arjan van de Ven Cc: Len Brown Cc: fweisbec@gmail.com Cc: peterz@infradead.org Cc: Paul Mackerras LKML-Reference: <20090919133533.139c2a46@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 3 --- drivers/cpuidle/cpuidle.c | 2 ++ 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7b60e3906889..847ab4160315 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -309,7 +309,6 @@ void default_idle(void) else local_irq_enable(); current_thread_info()->status |= TS_POLLING; - trace_power_end(0); } else { local_irq_enable(); /* loop is done by the caller */ @@ -377,7 +376,6 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) if (!need_resched()) __mwait(ax, cx); } - trace_power_end(0); } /* Default MONITOR/MWAIT with no hints, used for default C1 state */ @@ -394,7 +392,6 @@ static void mwait_idle(void) __sti_mwait(0, 0); else local_irq_enable(); - trace_power_end(0); } else local_irq_enable(); } diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index 8504a2108557..ad41f19b8e3f 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "cpuidle.h" @@ -91,6 +92,7 @@ static void cpuidle_idle_call(void) /* give the governor an opportunity to reflect on the outcome */ if (cpuidle_curr_governor->reflect) cpuidle_curr_governor->reflect(dev); + trace_power_end(0); } /** -- cgit v1.2.3 From a017421ddcdf34474f1133b27e33ab2ca6c19b96 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 15 Sep 2009 17:17:27 +0200 Subject: x86, mce: Fix compile warning in case of CONFIG_SMP=n Fix following compile warning: arch/x86/kernel/cpu/mcheck/mce_amd.c: In function 'threshold_create_bank': arch/x86/kernel/cpu/mcheck/mce_amd.c:492: warning: unused variable 'c' which shows up when kernel is compiled with CONFIG_SMP=n. Signed-off-by: Andreas Herrmann LKML-Reference: <20090915151727.GB21670@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_amd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 8cd5224943b5..83a3d1f4efca 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -489,8 +489,9 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) int i, err = 0; struct threshold_bank *b = NULL; char name[32]; +#ifdef CONFIG_SMP struct cpuinfo_x86 *c = &cpu_data(cpu); - +#endif sprintf(name, "threshold_bank%i", bank); -- cgit v1.2.3 From eda6da9286ad5b35b1eb70f6368958a8ee41a9dd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Sep 2009 11:07:57 -0700 Subject: Revert 'x86: Fix system crash when loading with "reservetop" parameter' After close looking, commit 8126dec3 will break: 1. some cpu feature in early stage too, like cpu_has_x2apic 2. will break built-in-command line 3. will break other memmap= and mem= 4. early_dbgp and early_console that will use early_ioremap to access mmio (?) So revert it. Reported-by: Hugh Dickins , Cc: Linus Torvalds , Cc: Andrew Morton , LKML-Reference: <4AB51DFD.2000904@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 19f15c4076fb..f5baa2ae0c87 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -712,21 +712,6 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif - strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); - *cmdline_p = command_line; - -#ifdef CONFIG_X86_64 - /* - * Must call this twice: Once just to detect whether hardware doesn't - * support NX (so that the early EHCI debug console setup can safely - * call set_fixmap(), and then again after parsing early parameters to - * honor the respective command line option. - */ - check_efer(); -#endif - - parse_early_param(); - /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); @@ -809,6 +794,21 @@ void __init setup_arch(char **cmdline_p) #endif #endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); + *cmdline_p = command_line; + +#ifdef CONFIG_X86_64 + /* + * Must call this twice: Once just to detect whether hardware doesn't + * support NX (so that the early EHCI debug console setup can safely + * call set_fixmap(), and then again after parsing early parameters to + * honor the respective command line option. + */ + check_efer(); +#endif + + parse_early_param(); + #ifdef CONFIG_X86_64 check_efer(); #endif -- cgit v1.2.3 From e454cea20bdcff10ee698d11b8882662a0153a47 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Fri, 18 Sep 2009 23:01:12 +0200 Subject: Driver-Core: extend devnode callbacks to provide permissions This allows subsytems to provide devtmpfs with non-default permissions for the device node. Instead of the default mode of 0600, null, zero, random, urandom, full, tty, ptmx now have a mode of 0666, which allows non-privileged processes to access standard device nodes in case no other userspace process applies the expected permissions. This also fixes a wrong assignment in pktcdvd and a checkpatch.pl complain. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpuid.c | 4 ++-- arch/x86/kernel/microcode_core.c | 2 +- arch/x86/kernel/msr.c | 4 ++-- block/bsg.c | 4 ++-- block/genhd.c | 8 ++++---- drivers/base/core.c | 19 ++++++++++++------- drivers/base/devtmpfs.c | 24 ++++++++++++++++-------- drivers/block/aoe/aoechr.c | 4 ++-- drivers/block/pktcdvd.c | 6 +++--- drivers/char/hw_random/core.c | 2 +- drivers/char/mem.c | 29 +++++++++++++++++++---------- drivers/char/misc.c | 10 ++++++---- drivers/char/raw.c | 4 ++-- drivers/char/tty_io.c | 11 +++++++++++ drivers/gpu/drm/drm_sysfs.c | 4 ++-- drivers/hid/usbhid/hiddev.c | 4 ++-- drivers/input/input.c | 4 ++-- drivers/md/dm-ioctl.c | 2 +- drivers/media/dvb/dvb-core/dvbdev.c | 4 ++-- drivers/net/tun.c | 2 +- drivers/usb/class/usblp.c | 4 ++-- drivers/usb/core/file.c | 8 ++++---- drivers/usb/core/usb.c | 4 ++-- drivers/usb/misc/iowarrior.c | 4 ++-- drivers/usb/misc/legousbtower.c | 4 ++-- include/linux/device.h | 7 ++++--- include/linux/genhd.h | 2 +- include/linux/miscdevice.h | 3 ++- include/linux/usb.h | 4 ++-- sound/sound_core.c | 4 ++-- 30 files changed, 116 insertions(+), 79 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index b07af8861244..6a52d4b36a30 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -182,7 +182,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_nodename(struct device *dev) +static char *cpuid_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } @@ -203,7 +203,7 @@ static int __init cpuid_init(void) err = PTR_ERR(cpuid_class); goto out_chrdev; } - cpuid_class->nodename = cpuid_nodename; + cpuid_class->devnode = cpuid_devnode; for_each_online_cpu(i) { err = cpuid_device_create(i); if (err != 0) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9371448290ac..0db7969b0dde 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -236,7 +236,7 @@ static const struct file_operations microcode_fops = { static struct miscdevice microcode_dev = { .minor = MICROCODE_MINOR, .name = "microcode", - .devnode = "cpu/microcode", + .nodename = "cpu/microcode", .fops = µcode_fops, }; diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7dd950094178..6a3cefc7dda1 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -241,7 +241,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_nodename(struct device *dev) +static char *msr_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } @@ -262,7 +262,7 @@ static int __init msr_init(void) err = PTR_ERR(msr_class); goto out_chrdev; } - msr_class->nodename = msr_nodename; + msr_class->devnode = msr_devnode; for_each_online_cpu(i) { err = msr_device_create(i); if (err != 0) diff --git a/block/bsg.c b/block/bsg.c index 5f184bb3ff9e..0676301f16d0 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -1062,7 +1062,7 @@ EXPORT_SYMBOL_GPL(bsg_register_queue); static struct cdev bsg_cdev; -static char *bsg_nodename(struct device *dev) +static char *bsg_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); } @@ -1087,7 +1087,7 @@ static int __init bsg_init(void) ret = PTR_ERR(bsg_class); goto destroy_kmemcache; } - bsg_class->nodename = bsg_nodename; + bsg_class->devnode = bsg_devnode; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); if (ret) diff --git a/block/genhd.c b/block/genhd.c index 2ad91ddad8e2..517e4332cb37 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -998,12 +998,12 @@ struct class block_class = { .name = "block", }; -static char *block_nodename(struct device *dev) +static char *block_devnode(struct device *dev, mode_t *mode) { struct gendisk *disk = dev_to_disk(dev); - if (disk->nodename) - return disk->nodename(disk); + if (disk->devnode) + return disk->devnode(disk, mode); return NULL; } @@ -1011,7 +1011,7 @@ static struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, - .nodename = block_nodename, + .devnode = block_devnode, }; #ifdef CONFIG_PROC_FS diff --git a/drivers/base/core.c b/drivers/base/core.c index 390e664ec1c7..6bee6af8d8e1 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -166,13 +166,16 @@ static int dev_uevent(struct kset *kset, struct kobject *kobj, if (MAJOR(dev->devt)) { const char *tmp; const char *name; + mode_t mode = 0; add_uevent_var(env, "MAJOR=%u", MAJOR(dev->devt)); add_uevent_var(env, "MINOR=%u", MINOR(dev->devt)); - name = device_get_nodename(dev, &tmp); + name = device_get_devnode(dev, &mode, &tmp); if (name) { add_uevent_var(env, "DEVNAME=%s", name); kfree(tmp); + if (mode) + add_uevent_var(env, "DEVMODE=%#o", mode & 0777); } } @@ -1148,8 +1151,9 @@ static struct device *next_device(struct klist_iter *i) } /** - * device_get_nodename - path of device node file + * device_get_devnode - path of device node file * @dev: device + * @mode: returned file access mode * @tmp: possibly allocated string * * Return the relative path of a possible device node. @@ -1157,21 +1161,22 @@ static struct device *next_device(struct klist_iter *i) * a name. This memory is returned in tmp and needs to be * freed by the caller. */ -const char *device_get_nodename(struct device *dev, const char **tmp) +const char *device_get_devnode(struct device *dev, + mode_t *mode, const char **tmp) { char *s; *tmp = NULL; /* the device type may provide a specific name */ - if (dev->type && dev->type->nodename) - *tmp = dev->type->nodename(dev); + if (dev->type && dev->type->devnode) + *tmp = dev->type->devnode(dev, mode); if (*tmp) return *tmp; /* the class may provide a specific name */ - if (dev->class && dev->class->nodename) - *tmp = dev->class->nodename(dev); + if (dev->class && dev->class->devnode) + *tmp = dev->class->devnode(dev, mode); if (*tmp) return *tmp; diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index fd488ad4263a..a1cb5afe6801 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -6,9 +6,10 @@ * During bootup, before any driver core device is registered, * devtmpfs, a tmpfs-based filesystem is created. Every driver-core * device which requests a device node, will add a node in this - * filesystem. The node is named after the the name of the device, - * or the susbsytem can provide a custom name. All devices are - * owned by root and have a mode of 0600. + * filesystem. + * By default, all devices are named after the the name of the + * device, owned by root and have a default mode of 0600. Subsystems + * can overwrite the default setting if needed. */ #include @@ -20,6 +21,7 @@ #include #include #include +#include #include static struct vfsmount *dev_mnt; @@ -134,7 +136,7 @@ int devtmpfs_create_node(struct device *dev) const char *tmp = NULL; const char *nodename; const struct cred *curr_cred; - mode_t mode; + mode_t mode = 0; struct nameidata nd; struct dentry *dentry; int err; @@ -142,14 +144,16 @@ int devtmpfs_create_node(struct device *dev) if (!dev_mnt) return 0; - nodename = device_get_nodename(dev, &tmp); + nodename = device_get_devnode(dev, &mode, &tmp); if (!nodename) return -ENOMEM; + if (mode == 0) + mode = 0600; if (is_blockdev(dev)) - mode = S_IFBLK|0600; + mode |= S_IFBLK; else - mode = S_IFCHR|0600; + mode |= S_IFCHR; curr_cred = override_creds(&init_cred); err = vfs_path_lookup(dev_mnt->mnt_root, dev_mnt, @@ -165,8 +169,12 @@ int devtmpfs_create_node(struct device *dev) dentry = lookup_create(&nd, 0); if (!IS_ERR(dentry)) { + int umask; + + umask = sys_umask(0000); err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, dev->devt); + sys_umask(umask); /* mark as kernel created inode */ if (!err) dentry->d_inode->i_private = &dev_mnt; @@ -271,7 +279,7 @@ int devtmpfs_delete_node(struct device *dev) if (!dev_mnt) return 0; - nodename = device_get_nodename(dev, &tmp); + nodename = device_get_devnode(dev, NULL, &tmp); if (!nodename) return -ENOMEM; diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c index 19888354188f..62141ec09a22 100644 --- a/drivers/block/aoe/aoechr.c +++ b/drivers/block/aoe/aoechr.c @@ -266,7 +266,7 @@ static const struct file_operations aoe_fops = { .owner = THIS_MODULE, }; -static char *aoe_nodename(struct device *dev) +static char *aoe_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev)); } @@ -288,7 +288,7 @@ aoechr_init(void) unregister_chrdev(AOE_MAJOR, "aoechr"); return PTR_ERR(aoe_class); } - aoe_class->nodename = aoe_nodename; + aoe_class->devnode = aoe_devnode; for (i = 0; i < ARRAY_SIZE(chardevs); ++i) device_create(aoe_class, NULL, diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 95f11cdef203..fd5bb8ad59a9 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2857,7 +2857,7 @@ static struct block_device_operations pktcdvd_ops = { .media_changed = pkt_media_changed, }; -static char *pktcdvd_nodename(struct gendisk *gd) +static char *pktcdvd_devnode(struct gendisk *gd, mode_t *mode) { return kasprintf(GFP_KERNEL, "pktcdvd/%s", gd->disk_name); } @@ -2914,7 +2914,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) disk->fops = &pktcdvd_ops; disk->flags = GENHD_FL_REMOVABLE; strcpy(disk->disk_name, pd->name); - disk->nodename = pktcdvd_nodename; + disk->devnode = pktcdvd_devnode; disk->private_data = pd; disk->queue = blk_alloc_queue(GFP_KERNEL); if (!disk->queue) @@ -3070,7 +3070,7 @@ static const struct file_operations pkt_ctl_fops = { static struct miscdevice pkt_misc = { .minor = MISC_DYNAMIC_MINOR, .name = DRIVER_NAME, - .name = "pktcdvd/control", + .nodename = "pktcdvd/control", .fops = &pkt_ctl_fops }; diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c index fc93e2fc7c71..1573aebd54b5 100644 --- a/drivers/char/hw_random/core.c +++ b/drivers/char/hw_random/core.c @@ -153,7 +153,7 @@ static const struct file_operations rng_chrdev_ops = { static struct miscdevice rng_miscdev = { .minor = RNG_MISCDEV_MINOR, .name = RNG_MODULE_NAME, - .devnode = "hwrng", + .nodename = "hwrng", .fops = &rng_chrdev_ops, }; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 0491cdf63f2a..0aede1d6a9ea 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -866,24 +866,25 @@ static const struct file_operations kmsg_fops = { static const struct memdev { const char *name; + mode_t mode; const struct file_operations *fops; struct backing_dev_info *dev_info; } devlist[] = { - [ 1] = { "mem", &mem_fops, &directly_mappable_cdev_bdi }, + [1] = { "mem", 0, &mem_fops, &directly_mappable_cdev_bdi }, #ifdef CONFIG_DEVKMEM - [ 2] = { "kmem", &kmem_fops, &directly_mappable_cdev_bdi }, + [2] = { "kmem", 0, &kmem_fops, &directly_mappable_cdev_bdi }, #endif - [ 3] = {"null", &null_fops, NULL }, + [3] = { "null", 0666, &null_fops, NULL }, #ifdef CONFIG_DEVPORT - [ 4] = { "port", &port_fops, NULL }, + [4] = { "port", 0, &port_fops, NULL }, #endif - [ 5] = { "zero", &zero_fops, &zero_bdi }, - [ 7] = { "full", &full_fops, NULL }, - [ 8] = { "random", &random_fops, NULL }, - [ 9] = { "urandom", &urandom_fops, NULL }, - [11] = { "kmsg", &kmsg_fops, NULL }, + [5] = { "zero", 0666, &zero_fops, &zero_bdi }, + [7] = { "full", 0666, &full_fops, NULL }, + [8] = { "random", 0666, &random_fops, NULL }, + [9] = { "urandom", 0666, &urandom_fops, NULL }, + [11] = { "kmsg", 0, &kmsg_fops, NULL }, #ifdef CONFIG_CRASH_DUMP - [12] = { "oldmem", &oldmem_fops, NULL }, + [12] = { "oldmem", 0, &oldmem_fops, NULL }, #endif }; @@ -920,6 +921,13 @@ static const struct file_operations memory_fops = { .open = memory_open, }; +static char *mem_devnode(struct device *dev, mode_t *mode) +{ + if (mode && devlist[MINOR(dev->devt)].mode) + *mode = devlist[MINOR(dev->devt)].mode; + return NULL; +} + static struct class *mem_class; static int __init chr_dev_init(void) @@ -935,6 +943,7 @@ static int __init chr_dev_init(void) printk("unable to get major %d for memory devs\n", MEM_MAJOR); mem_class = class_create(THIS_MODULE, "mem"); + mem_class->devnode = mem_devnode; for (minor = 1; minor < ARRAY_SIZE(devlist); minor++) { if (!devlist[minor].name) continue; diff --git a/drivers/char/misc.c b/drivers/char/misc.c index 62c99fa59e2b..1ee27cc23426 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -263,12 +263,14 @@ int misc_deregister(struct miscdevice *misc) EXPORT_SYMBOL(misc_register); EXPORT_SYMBOL(misc_deregister); -static char *misc_nodename(struct device *dev) +static char *misc_devnode(struct device *dev, mode_t *mode) { struct miscdevice *c = dev_get_drvdata(dev); - if (c->devnode) - return kstrdup(c->devnode, GFP_KERNEL); + if (mode && c->mode) + *mode = c->mode; + if (c->nodename) + return kstrdup(c->nodename, GFP_KERNEL); return NULL; } @@ -287,7 +289,7 @@ static int __init misc_init(void) err = -EIO; if (register_chrdev(MISC_MAJOR,"misc",&misc_fops)) goto fail_printk; - misc_class->nodename = misc_nodename; + misc_class->devnode = misc_devnode; return 0; fail_printk: diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 40268db02e22..64acd05f71c8 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -261,7 +261,7 @@ static const struct file_operations raw_ctl_fops = { static struct cdev raw_cdev; -static char *raw_nodename(struct device *dev) +static char *raw_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "raw/%s", dev_name(dev)); } @@ -289,7 +289,7 @@ static int __init raw_init(void) ret = PTR_ERR(raw_class); goto error_region; } - raw_class->nodename = raw_nodename; + raw_class->devnode = raw_devnode; device_create(raw_class, NULL, MKDEV(RAW_MAJOR, 0), NULL, "rawctl"); return 0; diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index a3afa0c387cd..c70d9dabefae 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -3056,11 +3056,22 @@ void __init console_init(void) } } +static char *tty_devnode(struct device *dev, mode_t *mode) +{ + if (!mode) + return NULL; + if (dev->devt == MKDEV(TTYAUX_MAJOR, 0) || + dev->devt == MKDEV(TTYAUX_MAJOR, 2)) + *mode = 0666; + return NULL; +} + static int __init tty_class_init(void) { tty_class = class_create(THIS_MODULE, "tty"); if (IS_ERR(tty_class)) return PTR_ERR(tty_class); + tty_class->devnode = tty_devnode; return 0; } diff --git a/drivers/gpu/drm/drm_sysfs.c b/drivers/gpu/drm/drm_sysfs.c index f7a615b80c70..5301f226cb1c 100644 --- a/drivers/gpu/drm/drm_sysfs.c +++ b/drivers/gpu/drm/drm_sysfs.c @@ -76,7 +76,7 @@ static ssize_t version_show(struct class *dev, char *buf) CORE_MINOR, CORE_PATCHLEVEL, CORE_DATE); } -static char *drm_nodename(struct device *dev) +static char *drm_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "dri/%s", dev_name(dev)); } @@ -112,7 +112,7 @@ struct class *drm_sysfs_create(struct module *owner, char *name) if (err) goto err_out_class; - class->nodename = drm_nodename; + class->devnode = drm_devnode; return class; diff --git a/drivers/hid/usbhid/hiddev.c b/drivers/hid/usbhid/hiddev.c index 4d1dc0cf1401..8b6ee247bfe4 100644 --- a/drivers/hid/usbhid/hiddev.c +++ b/drivers/hid/usbhid/hiddev.c @@ -852,14 +852,14 @@ static const struct file_operations hiddev_fops = { #endif }; -static char *hiddev_nodename(struct device *dev) +static char *hiddev_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver hiddev_class = { .name = "hiddev%d", - .nodename = hiddev_nodename, + .devnode = hiddev_devnode, .fops = &hiddev_fops, .minor_base = HIDDEV_MINOR_BASE, }; diff --git a/drivers/input/input.c b/drivers/input/input.c index 851791d955f3..556539d617a4 100644 --- a/drivers/input/input.c +++ b/drivers/input/input.c @@ -1265,14 +1265,14 @@ static struct device_type input_dev_type = { .uevent = input_dev_uevent, }; -static char *input_nodename(struct device *dev) +static char *input_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "input/%s", dev_name(dev)); } struct class input_class = { .name = "input", - .nodename = input_nodename, + .devnode = input_devnode, }; EXPORT_SYMBOL_GPL(input_class); diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 7f77f18fcafa..a67942931582 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c @@ -1532,7 +1532,7 @@ static const struct file_operations _ctl_fops = { static struct miscdevice _dm_misc = { .minor = MISC_DYNAMIC_MINOR, .name = DM_NAME, - .devnode = "mapper/control", + .nodename = "mapper/control", .fops = &_ctl_fops }; diff --git a/drivers/media/dvb/dvb-core/dvbdev.c b/drivers/media/dvb/dvb-core/dvbdev.c index 479dd05762a5..94159b90f733 100644 --- a/drivers/media/dvb/dvb-core/dvbdev.c +++ b/drivers/media/dvb/dvb-core/dvbdev.c @@ -447,7 +447,7 @@ static int dvb_uevent(struct device *dev, struct kobj_uevent_env *env) return 0; } -static char *dvb_nodename(struct device *dev) +static char *dvb_devnode(struct device *dev, mode_t *mode) { struct dvb_device *dvbdev = dev_get_drvdata(dev); @@ -478,7 +478,7 @@ static int __init init_dvbdev(void) goto error; } dvb_class->dev_uevent = dvb_uevent; - dvb_class->nodename = dvb_nodename; + dvb_class->devnode = dvb_devnode; return 0; error: diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 3f5d28851aa2..d3ee1994b02f 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1370,7 +1370,7 @@ static const struct file_operations tun_fops = { static struct miscdevice tun_miscdev = { .minor = TUN_MINOR, .name = "tun", - .devnode = "net/tun", + .nodename = "net/tun", .fops = &tun_fops, }; diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index 26c09f0257db..9bc112ee7803 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -1057,14 +1057,14 @@ static const struct file_operations usblp_fops = { .release = usblp_release, }; -static char *usblp_nodename(struct device *dev) +static char *usblp_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } static struct usb_class_driver usblp_class = { .name = "lp%d", - .nodename = usblp_nodename, + .devnode = usblp_devnode, .fops = &usblp_fops, .minor_base = USBLP_MINOR_BASE, }; diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c index 5cef88929b3e..222ee07ea680 100644 --- a/drivers/usb/core/file.c +++ b/drivers/usb/core/file.c @@ -67,14 +67,14 @@ static struct usb_class { struct class *class; } *usb_class; -static char *usb_nodename(struct device *dev) +static char *usb_devnode(struct device *dev, mode_t *mode) { struct usb_class_driver *drv; drv = dev_get_drvdata(dev); - if (!drv || !drv->nodename) + if (!drv || !drv->devnode) return NULL; - return drv->nodename(dev); + return drv->devnode(dev, mode); } static int init_usb_class(void) @@ -100,7 +100,7 @@ static int init_usb_class(void) kfree(usb_class); usb_class = NULL; } - usb_class->class->nodename = usb_nodename; + usb_class->class->devnode = usb_devnode; exit: return result; diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index a26f73880c32..43ee943d757a 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -311,7 +311,7 @@ static struct dev_pm_ops usb_device_pm_ops = { #endif /* CONFIG_PM */ -static char *usb_nodename(struct device *dev) +static char *usb_devnode(struct device *dev, mode_t *mode) { struct usb_device *usb_dev; @@ -324,7 +324,7 @@ struct device_type usb_device_type = { .name = "usb_device", .release = usb_release_dev, .uevent = usb_dev_uevent, - .nodename = usb_nodename, + .devnode = usb_devnode, .pm = &usb_device_pm_ops, }; diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 90e1a8dedfa9..e75bb87ee92b 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -727,7 +727,7 @@ static const struct file_operations iowarrior_fops = { .poll = iowarrior_poll, }; -static char *iowarrior_nodename(struct device *dev) +static char *iowarrior_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } @@ -738,7 +738,7 @@ static char *iowarrior_nodename(struct device *dev) */ static struct usb_class_driver iowarrior_class = { .name = "iowarrior%d", - .nodename = iowarrior_nodename, + .devnode = iowarrior_devnode, .fops = &iowarrior_fops, .minor_base = IOWARRIOR_MINOR_BASE, }; diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index c1e2433f640d..97efeaec4d52 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -266,7 +266,7 @@ static const struct file_operations tower_fops = { .llseek = tower_llseek, }; -static char *legousbtower_nodename(struct device *dev) +static char *legousbtower_devnode(struct device *dev, mode_t *mode) { return kasprintf(GFP_KERNEL, "usb/%s", dev_name(dev)); } @@ -277,7 +277,7 @@ static char *legousbtower_nodename(struct device *dev) */ static struct usb_class_driver tower_class = { .name = "legousbtower%d", - .nodename = legousbtower_nodename, + .devnode = legousbtower_devnode, .fops = &tower_fops, .minor_base = LEGO_USB_TOWER_MINOR_BASE, }; diff --git a/include/linux/device.h b/include/linux/device.h index 847b763e40e9..aca31bf7d8ed 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -193,7 +193,7 @@ struct class { struct kobject *dev_kobj; int (*dev_uevent)(struct device *dev, struct kobj_uevent_env *env); - char *(*nodename)(struct device *dev); + char *(*devnode)(struct device *dev, mode_t *mode); void (*class_release)(struct class *class); void (*dev_release)(struct device *dev); @@ -298,7 +298,7 @@ struct device_type { const char *name; const struct attribute_group **groups; int (*uevent)(struct device *dev, struct kobj_uevent_env *env); - char *(*nodename)(struct device *dev); + char *(*devnode)(struct device *dev, mode_t *mode); void (*release)(struct device *dev); const struct dev_pm_ops *pm; @@ -487,7 +487,8 @@ extern struct device *device_find_child(struct device *dev, void *data, extern int device_rename(struct device *dev, char *new_name); extern int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); -extern const char *device_get_nodename(struct device *dev, const char **tmp); +extern const char *device_get_devnode(struct device *dev, + mode_t *mode, const char **tmp); extern void *dev_get_drvdata(const struct device *dev); extern void dev_set_drvdata(struct device *dev, void *data); diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 44263cb27121..109d179adb93 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -142,7 +142,7 @@ struct gendisk { * disks that can't be partitioned. */ char disk_name[DISK_NAME_LEN]; /* name of major driver */ - char *(*nodename)(struct gendisk *gd); + char *(*devnode)(struct gendisk *gd, mode_t *mode); /* Array of pointers to partitions indexed by partno. * Protected with matching bdev lock but stat and other * non-critical accesses use RCU. Always access through diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 052117744629..adaf3c15e449 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -41,7 +41,8 @@ struct miscdevice { struct list_head list; struct device *parent; struct device *this_device; - const char *devnode; + const char *nodename; + mode_t mode; }; extern int misc_register(struct miscdevice * misc); diff --git a/include/linux/usb.h b/include/linux/usb.h index b1e3c2fbfe11..a8fe05f224e5 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -922,7 +922,7 @@ extern struct bus_type usb_bus_type; /** * struct usb_class_driver - identifies a USB driver that wants to use the USB major number * @name: the usb class device name for this driver. Will show up in sysfs. - * @nodename: Callback to provide a naming hint for a possible + * @devnode: Callback to provide a naming hint for a possible * device node to create. * @fops: pointer to the struct file_operations of this driver. * @minor_base: the start of the minor range for this driver. @@ -933,7 +933,7 @@ extern struct bus_type usb_bus_type; */ struct usb_class_driver { char *name; - char *(*nodename)(struct device *dev); + char *(*devnode)(struct device *dev, mode_t *mode); const struct file_operations *fops; int minor_base; }; diff --git a/sound/sound_core.c b/sound/sound_core.c index bb4b88e606bb..49c998186592 100644 --- a/sound/sound_core.c +++ b/sound/sound_core.c @@ -29,7 +29,7 @@ MODULE_DESCRIPTION("Core sound module"); MODULE_AUTHOR("Alan Cox"); MODULE_LICENSE("GPL"); -static char *sound_nodename(struct device *dev) +static char *sound_devnode(struct device *dev, mode_t *mode) { if (MAJOR(dev->devt) == SOUND_MAJOR) return NULL; @@ -50,7 +50,7 @@ static int __init init_soundcore(void) return PTR_ERR(sound_class); } - sound_class->nodename = sound_nodename; + sound_class->devnode = sound_devnode; return 0; } -- cgit v1.2.3 From 144374dcc3ad0436f0a1bb3095836cf0ec32eebe Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 20 Sep 2009 15:09:26 +0530 Subject: includecheck fix: x86, traps.c fix the following 'make includecheck' warning: arch/x86/kernel/traps.c: asm/traps.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Cc: Ingo Molnar Cc: Sam Ravnborg LKML-Reference: <1247065094.4382.49.camel@ht.satnam> --- arch/x86/kernel/traps.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 7dc0de9d1ed9..9346e102338d 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -65,7 +65,6 @@ #else #include #include -#include asmlinkage int system_call(void); -- cgit v1.2.3 From 5ac7687860dbfd3dd90e09d2c10dd31de91f20c2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 20 Sep 2009 16:34:24 +0530 Subject: includecheck fix: x86, cpu/common.c fix the following 'make includecheck' warning: arch/x86/kernel/cpu/common.c: linux/smp.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Cc: Alan Cox Cc: Ingo Molnar LKML-Reference: <1252087783.6385.10.camel@ht.satnam> --- arch/x86/kernel/cpu/common.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2055fc2b2e6b..2fea97eccf77 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -34,7 +34,6 @@ #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC #include -- cgit v1.2.3 From a1792cdacaf5180e04e07811e220c4a3b4a9c33e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Sep 2009 10:04:47 +0200 Subject: perf_counter: x86: Fix PMU resource leak Dave noticed that we leak the PMU resource reservations when we fail the hardware counter init. Reported-by: David Miller Signed-off-by: Peter Zijlstra Acked-by: David Miller LKML-Reference: <1252483487.7746.164.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index dbdf712fae9e..a6c8b27553cd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -924,6 +924,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (err) return err; + counter->destroy = hw_perf_counter_destroy; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -953,8 +955,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EOPNOTSUPP; } - counter->destroy = hw_perf_counter_destroy; - /* * Raw event type provide the config in the event structure */ @@ -2107,8 +2107,11 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) int err; err = __hw_perf_counter_init(counter); - if (err) + if (err) { + if (counter->destroy) + counter->destroy(counter); return ERR_PTR(err); + } return &pmu; } -- cgit v1.2.3 From 9ff6d8e06fbd9249804d43574167c2acc52886be Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 14 Sep 2009 09:28:07 -0700 Subject: x86, mtrr: Convert loop to a while based construct, avoid naked semicolon Perhaps this is a more readable/standard form. Signed-off-by: Joe Perches LKML-Reference: <1252945687.3937.14.camel@Joe-Laptop.home> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/if.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 08b6ea4c62b4..f04e72527604 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -126,8 +126,8 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) return -EINVAL; base = simple_strtoull(line + 5, &ptr, 0); - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; if (strncmp(ptr, "size=", 5)) return -EINVAL; @@ -135,14 +135,14 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) size = simple_strtoull(ptr + 5, &ptr, 0); if ((base & 0xfff) || (size & 0xfff)) return -EINVAL; - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; if (strncmp(ptr, "type=", 5)) return -EINVAL; ptr += 5; - for (; isspace(*ptr); ++ptr) - ; + while (isspace(*ptr)) + ptr++; for (i = 0; i < MTRR_NUM_TYPES; ++i) { if (strcmp(ptr, mtrr_strings[i])) -- cgit v1.2.3 From efc8f7419ea0a97c38d0b80785c8c3d9ab8a4872 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Fri, 18 Sep 2009 06:39:15 +0800 Subject: x86/i386: Remove duplicated #include Remove duplicated #include in: arch/x86/kernel/cpu/common.c Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2055fc2b2e6b..2fea97eccf77 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -34,7 +34,6 @@ #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC #include -- cgit v1.2.3 From 8312136fa8b0a3ec7323bbb1a46be8c0c26e994e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 15 Sep 2009 11:12:30 +0400 Subject: x86, apic: Fix missed handling of discrete apics In case of discrete (pretty old) apics we may have cpu_has_apic bit not set but have to check if smp_found_config (MP spec) is there and apic was not disabled. Also don't forget to print apic/io-apic for such case as well. Signed-off-by: Cyrill Gorcunov Cc: "Maciej W. Rozycki" Cc: Yinghai Lu LKML-Reference: <20090915071230.GA10604@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 13 +++++++++++++ arch/x86/kernel/apic/apic.c | 2 +- arch/x86/kernel/apic/io_apic.c | 4 ++-- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 586b7adb8e53..9a86fb40490d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -65,6 +65,19 @@ static inline void default_inquire_remote_apic(int apicid) __inquire_remote_apic(apicid); } +/* + * With 82489DX we can't rely on apic feature bit + * retrieved via cpuid but still have to deal with + * such an apic chip so we assume that SMP configuration + * is found from MP table (64bit case uses ACPI mostly + * which set smp presence flag as well so we are safe + * to use this helper too). + */ +static inline bool apic_from_smp_config(void) +{ + return smp_found_config && !disable_apic; +} + /* * Basic functions accessing APICs. */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 072aea6c630b..2d59dfa29785 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -978,7 +978,7 @@ void lapic_shutdown(void) { unsigned long flags; - if (!cpu_has_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return; local_irq_save(flags); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 3c8f9e75d038..593c4f8a8bb8 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1863,7 +1863,7 @@ __apicdebuginit(int) print_all_ICs(void) print_PIC(); /* don't print out if apic is not there */ - if (!cpu_has_apic || disable_apic) + if (!cpu_has_apic && !apic_from_smp_config()) return 0; print_all_local_APICs(); @@ -1981,7 +1981,7 @@ void disable_IO_APIC(void) /* * Use virtual wire A mode when interrupt remapping is enabled. */ - if (cpu_has_apic) + if (cpu_has_apic || apic_from_smp_config()) disconnect_bsp_APIC(!intr_remapping_enabled && ioapic_i8259.pin != -1); } -- cgit v1.2.3 From 878f4f533e5b4498215e67e0f886b0fc81417f5e Mon Sep 17 00:00:00 2001 From: Felipe Contreras Date: Thu, 17 Sep 2009 00:38:38 +0300 Subject: x86: Trivial whitespace cleanups Signed-off-by: Felipe Contreras Cc: Vegard Nossum Cc: Pekka Enberg Cc: Andrew Morton Cc: Alok N Kataria Cc: "Tan Wei Chong" Cc: Len Brown Cc: Lin Ming Cc: Bob Moore LKML-Reference: <1253137123-18047-2-git-send-email-felipe.contreras@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/string_32.h | 1 - arch/x86/kernel/tsc.c | 2 +- drivers/acpi/acpica/tbfadt.c | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index c86f452256de..ae907e617181 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -65,7 +65,6 @@ static __always_inline void *__constant_memcpy(void *to, const void *from, case 4: *(int *)to = *(int *)from; return to; - case 3: *(short *)to = *(short *)from; *((char *)to + 2) = *((char *)from + 2); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 71f4368b357e..8a2fc1123690 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -670,7 +670,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, if ((val == CPUFREQ_PRECHANGE && freq->old < freq->new) || (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) || (val == CPUFREQ_RESUMECHANGE)) { - *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); + *lpj = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new); tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) diff --git a/drivers/acpi/acpica/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 82b02dcb942e..c016335fb759 100644 --- a/drivers/acpi/acpica/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -275,7 +275,6 @@ void acpi_tb_parse_fadt(u32 table_index) void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) { - /* * Check if the FADT is larger than the largest table that we expect * (the ACPI 2.0/3.0 version). If so, truncate the table, and issue -- cgit v1.2.3 From 414128bd33fef3242747154dfa444970bc5caf4c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 4 Sep 2009 09:13:49 +0100 Subject: x86: Increase timeout for EHCI debug port reset completion in early printk On one of my systems, several thousand iterations are needed before CMD_RESET can be observed clear after setting it. Using a much higher value here obviously cannot hurt. Signed-off-by: Jan Beulich Cc: Yinghai Lu LKML-Reference: <4AA0E85D02000078000136F9@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 335f049d110f..b11cab3c323a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -624,7 +624,7 @@ try_next_port: return -1; } - loop = 10; + loop = 100000; /* Reset the EHCI controller */ cmd = readl(&ehci_regs->command); cmd |= CMD_RESET; -- cgit v1.2.3 From 5f68563996e812f9ca35b3939ad2a42e5d254d66 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 4 Sep 2009 09:16:22 +0100 Subject: x86: cpuinit-annotate SMP boot trampolines properly Add missing annotations, and make use of include/linux/init.h's macros. Signed-off-by: Jan Beulich LKML-Reference: <4AA0E8F60200007800013703@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/trampoline.c | 4 ++-- arch/x86/kernel/trampoline_32.S | 8 ++------ arch/x86/kernel/trampoline_64.S | 5 +++-- 3 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 808031a5ba19..699f7eeb896a 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -4,7 +4,7 @@ #include /* ready for x86_64 and x86 */ -unsigned char *trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +26,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -unsigned long setup_trampoline(void) +unsigned long __cpuinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S index 66d874e5404c..8508237e8e43 100644 --- a/arch/x86/kernel/trampoline_32.S +++ b/arch/x86/kernel/trampoline_32.S @@ -28,16 +28,12 @@ */ #include +#include #include #include /* We can free up trampoline after bootup if cpu hotplug is not supported. */ -#ifndef CONFIG_HOTPLUG_CPU -.section ".cpuinit.data","aw",@progbits -#else -.section .rodata,"a",@progbits -#endif - +__CPUINITRODATA .code16 ENTRY(trampoline_data) diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index cddfb8d386b9..596d54c660a5 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -25,14 +25,15 @@ */ #include +#include #include #include #include #include #include -.section .rodata, "a", @progbits - +/* We can free up the trampoline after bootup if cpu hotplug is not supported. */ +__CPUINITRODATA .code16 ENTRY(trampoline_data) -- cgit v1.2.3 From 8d0cc631f6dd0a9283ceb7d61d8b85ecbcd355ea Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 4 Sep 2009 09:18:07 +0100 Subject: x86: Correct segment permission flags in 64-bit linker script While these don't get actively used (afaict), it still doesn't hurt for them to properly reflect what how respective segments will get mapped/ accessed. Signed-off-by: Jan Beulich LKML-Reference: <4AA0E95F0200007800013707@vpn.id2.novell.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0ccb57d5ee35..a46acccec38a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -45,9 +45,9 @@ PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_X86_64 - user PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(5); /* R_E */ #ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ + percpu PT_LOAD FLAGS(6); /* RW_ */ #endif init PT_LOAD FLAGS(7); /* RWE */ #endif -- cgit v1.2.3 From 6399c087458859cddff2d6b46befb95b866df3e0 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 4 Sep 2009 13:13:39 -0700 Subject: x86: Print the hypervisor returned tsc_khz during boot On an AMD-64 system the processor frequency that is printed during system boot, may be different than the tsc frequency that was returned by the hypervisor, due to the value returned from calibrate_cpu. For debugging timekeeping or other related issues it might be better to get the tsc_khz value returned by the hypervisor. The patch below now prints the tsc frequency that the VMware hypervisor returned. Signed-off-by: Alok N Kataria LKML-Reference: <1252095219.12518.13.camel@ank32.eng.vmware.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/vmware.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 0a46b4df5d80..1cbed97b59cf 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -58,6 +58,9 @@ static unsigned long vmware_get_tsc_khz(void) tsc_hz = eax | (((uint64_t)ebx) << 32); do_div(tsc_hz, 1000); BUG_ON(tsc_hz >> 32); + printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_hz / 1000, + (unsigned long) tsc_hz % 1000); return tsc_hz; } @@ -69,6 +72,9 @@ void __init vmware_platform_setup(void) if (ebx != UINT_MAX) x86_platform.calibrate_tsc = vmware_get_tsc_khz; + else + printk(KERN_WARNING + "Failed to get TSC freq from the hypervisor\n"); } /* -- cgit v1.2.3 From d200c922bc2b1ac88b8d33b6cfff2ed837af186a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 20 Sep 2009 18:14:13 -0400 Subject: Use new __init_task_data macro in arch init_task.c files. Signed-off-by: Joe Perches Signed-off-by: Tim Abbott Acked-by: Paul Mundt Signed-off-by: Sam Ravnborg --- arch/arm/kernel/init_task.c | 5 ++--- arch/avr32/kernel/init_task.c | 5 ++--- arch/cris/kernel/process.c | 5 ++--- arch/frv/kernel/init_task.c | 5 ++--- arch/h8300/kernel/init_task.c | 5 ++--- arch/ia64/kernel/init_task.c | 3 ++- arch/m32r/kernel/init_task.c | 5 ++--- arch/m68k/kernel/process.c | 6 +++--- arch/m68knommu/kernel/init_task.c | 5 ++--- arch/microblaze/kernel/init_task.c | 5 ++--- arch/mips/kernel/init_task.c | 5 ++--- arch/mn10300/kernel/init_task.c | 5 ++--- arch/parisc/kernel/init_task.c | 4 ++-- arch/powerpc/kernel/init_task.c | 5 ++--- arch/powerpc/kernel/machine_kexec_64.c | 5 +++-- arch/s390/kernel/init_task.c | 5 ++--- arch/score/kernel/init_task.c | 5 ++--- arch/sh/kernel/init_task.c | 5 ++--- arch/sparc/kernel/init_task.c | 5 ++--- arch/um/kernel/init_task.c | 5 ++--- arch/x86/kernel/init_task.c | 5 ++--- arch/xtensa/kernel/init_task.c | 5 ++--- 22 files changed, 46 insertions(+), 62 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index 3f470866bb89..e7cbb50dc356 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -24,9 +24,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * * The things we do for performance.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index 57ec9f2dcd95..6b2343e6fe33 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -18,9 +18,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); /* * Initial thread structure. Must be aligned on an 8192-byte boundary. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 51dcd04d2777..c99aeab7cef7 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -45,9 +45,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index 1d3df1d9495c..3c3e0b336a9d 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -19,9 +19,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index 089c65ed6eb3..54c1062ee80e 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -31,7 +31,6 @@ EXPORT_SYMBOL(init_task); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index c475fc281be7..e253ab8fcbc8 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -33,7 +33,8 @@ union { struct thread_info thread_info; } s; unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; -} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ +} init_task_mem asm ("init_task") __init_task_data = + {{ .task = INIT_TASK(init_task_mem.s.task), .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) }}; diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index fce57e5d3f91..6c42d5f8df50 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 72bad65dba3a..41230c595a8e 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -42,9 +42,9 @@ */ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -union thread_union init_thread_union -__attribute__((section(".data.init_task"), aligned(THREAD_SIZE))) - = { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data + __attribute__((aligned(THREAD_SIZE))) = + { INIT_THREAD_INFO(init_task) }; /* initial task structure */ struct task_struct init_task = INIT_TASK(init_task); diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index 45e97a207fed..cbf9dc3cc51d 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -31,7 +31,6 @@ EXPORT_SYMBOL(init_task); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/arch/microblaze/kernel/init_task.c b/arch/microblaze/kernel/init_task.c index 67da22579b62..b5d711f94ff8 100644 --- a/arch/microblaze/kernel/init_task.c +++ b/arch/microblaze/kernel/init_task.c @@ -19,9 +19,8 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = -{ INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index 5b457a40c784..6d6ca5305895 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -21,9 +21,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * * The things we do for performance.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"), - __aligned__(THREAD_SIZE))) = +union thread_union init_thread_union __init_task_data + __attribute__((__aligned__(THREAD_SIZE))) = { INIT_THREAD_INFO(init_task) }; /* diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index 80d423b80af3..a481b043bea7 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -27,9 +27,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c index 82974b20fc10..d020eae6525c 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -43,8 +43,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((aligned(128))) __attribute__((__section__(".data.init_task"))) = +union thread_union init_thread_union __init_task_data + __attribute__((aligned(128))) = { INIT_THREAD_INFO(init_task) }; #if PT_NLEVELS == 3 diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index ffc4253fef55..2375b7eb1c76 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -16,9 +16,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 49e705fcee6d..040bd1de8d99 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -249,8 +250,8 @@ static void kexec_prepare_cpus(void) * We could use a smaller stack if we don't care about anything using * current, but that audit has not been performed. */ -static union thread_union kexec_stack - __attribute__((__section__(".data.init_task"))) = { }; +static union thread_union kexec_stack __init_task_data = + { }; /* Our assembly helper, in kexec_stub.S */ extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index fe787f9e5f3f..4d1c9fb0b540 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -25,9 +25,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/score/kernel/init_task.c b/arch/score/kernel/init_task.c index ff952f6c63fd..baa03ee217d1 100644 --- a/arch/score/kernel/init_task.c +++ b/arch/score/kernel/init_task.c @@ -34,9 +34,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"), __aligned__(THREAD_SIZE))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index 1719957c0a69..11f2ea556a6b 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -17,9 +17,8 @@ struct pt_regs fake_swapper_regs; * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index 28125c5b3d3c..5fe3d65581f7 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -18,6 +18,5 @@ EXPORT_SYMBOL(init_task); * If this is not aligned on a 8k boundry, then you should change code * in etrap.S which assumes it. */ -union thread_union init_thread_union - __attribute__((section (".data.init_task"))) - = { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index b25121b537d8..8aa77b61a5ff 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -30,9 +30,8 @@ EXPORT_SYMBOL(init_task); * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; union thread_union cpu0_irqstack __attribute__((__section__(".data.init_irqstack"))) = diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc11..3a54dcb9cd0e 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index c4302f0e4ba0..cd122fb7e48a 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -23,9 +23,8 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = -{ INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; struct task_struct init_task = INIT_TASK(init_task); -- cgit v1.2.3 From 02b7da37f7acd49277dea1481dc0c5c246c09732 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Sun, 20 Sep 2009 18:14:14 -0400 Subject: Use macros for .bss.page_aligned section. This patch changes the remaining direct references to .bss.page_aligned in C and assembly code to use the macros in include/linux/linkage.h. Signed-off-by: Tim Abbott Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Acked-by: Paul Mundt Cc: Chris Zankel Signed-off-by: Sam Ravnborg --- arch/sh/kernel/irq.c | 6 ++---- arch/x86/kernel/head_32.S | 2 +- arch/x86/kernel/head_64.S | 2 +- arch/xtensa/kernel/head.S | 2 +- 4 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 60f8af4497c7..7cb933ba4957 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -165,11 +165,9 @@ asmlinkage int do_IRQ(unsigned int irq, struct pt_regs *regs) } #ifdef CONFIG_IRQSTACKS -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; /* * allocate per-cpu stacks for hardirq and for softirq processing diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index b766e8c7252d..1dac23958427 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,7 +608,7 @@ ENTRY(initial_code) /* * BSS section */ -.section ".bss.page_aligned","wa" +__PAGE_ALIGNED_BSS .align PAGE_SIZE_asm #ifdef CONFIG_X86_PAE swapper_pg_pmd: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fa54f78e2a05..d0bc0a13a437 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -418,7 +418,7 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 - .section .bss.page_aligned, "aw", @nobits + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) .skip PAGE_SIZE diff --git a/arch/xtensa/kernel/head.S b/arch/xtensa/kernel/head.S index d9ddc1ba761c..d215adcfd4ea 100644 --- a/arch/xtensa/kernel/head.S +++ b/arch/xtensa/kernel/head.S @@ -235,7 +235,7 @@ should_never_return: * BSS section */ -.section ".bss.page_aligned", "w" +__PAGE_ALIGNED_BSS #ifdef CONFIG_MMU ENTRY(swapper_pg_dir) .fill PAGE_SIZE, 1, 0 -- cgit v1.2.3 From abe1ee3a221d53778c3e58747bbec6e518e5471b Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Sun, 20 Sep 2009 18:14:15 -0400 Subject: Use macros for .data.page_aligned section. This patch changes the remaining direct references to .data.page_aligned in C and assembly code to use the macros in include/linux/linkage.h. Signed-off-by: Tim Abbott Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Haavard Skinnemoen Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Signed-off-by: Sam Ravnborg --- arch/avr32/mm/init.c | 4 +--- arch/powerpc/kernel/vdso.c | 3 ++- arch/powerpc/kernel/vdso32/vdso32_wrapper.S | 3 ++- arch/powerpc/kernel/vdso64/vdso64_wrapper.S | 3 ++- arch/s390/kernel/vdso.c | 2 +- arch/s390/kernel/vdso32/vdso32_wrapper.S | 3 ++- arch/s390/kernel/vdso64/vdso64_wrapper.S | 3 ++- arch/x86/include/asm/cache.h | 4 +++- arch/x86/kernel/head_32.S | 2 +- 9 files changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/avr32/mm/init.c b/arch/avr32/mm/init.c index e819fa69a90e..cc60d10cf8f7 100644 --- a/arch/avr32/mm/init.c +++ b/arch/avr32/mm/init.c @@ -24,11 +24,9 @@ #include #include -#define __page_aligned __attribute__((section(".data.page_aligned"))) - DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned; +pgd_t swapper_pg_dir[PTRS_PER_PGD] __page_aligned_data; struct page *empty_zero_page; EXPORT_SYMBOL(empty_zero_page); diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index a0abce251d0a..3faaf29bdb29 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -1,3 +1,4 @@ + /* * Copyright (C) 2004 Benjamin Herrenschmidt, IBM Corp. * @@ -74,7 +75,7 @@ static int vdso_ready; static union { struct vdso_data data; u8 page[PAGE_SIZE]; -} vdso_data_store __attribute__((__section__(".data.page_aligned"))); +} vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = &vdso_data_store.data; /* Format of the patch table */ diff --git a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S index 556f0caa5d84..6e8f507ed32b 100644 --- a/arch/powerpc/kernel/vdso32/vdso32_wrapper.S +++ b/arch/powerpc/kernel/vdso32/vdso32_wrapper.S @@ -1,7 +1,8 @@ #include +#include #include - .section ".data.page_aligned" + __PAGE_ALIGNED_DATA .globl vdso32_start, vdso32_end .balign PAGE_SIZE diff --git a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S index 0529cb9e3b97..b8553d62b792 100644 --- a/arch/powerpc/kernel/vdso64/vdso64_wrapper.S +++ b/arch/powerpc/kernel/vdso64/vdso64_wrapper.S @@ -1,7 +1,8 @@ #include +#include #include - .section ".data.page_aligned" + __PAGE_ALIGNED_DATA .globl vdso64_start, vdso64_end .balign PAGE_SIZE diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 45e1708b70fd..45a3e9a7ae21 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -75,7 +75,7 @@ __setup("vdso=", vdso_setup); static union { struct vdso_data data; u8 page[PAGE_SIZE]; -} vdso_data_store __attribute__((__section__(".data.page_aligned"))); +} vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = &vdso_data_store.data; /* diff --git a/arch/s390/kernel/vdso32/vdso32_wrapper.S b/arch/s390/kernel/vdso32/vdso32_wrapper.S index 61639a89e70b..ae42f8ce350b 100644 --- a/arch/s390/kernel/vdso32/vdso32_wrapper.S +++ b/arch/s390/kernel/vdso32/vdso32_wrapper.S @@ -1,7 +1,8 @@ #include +#include #include - .section ".data.page_aligned" + __PAGE_ALIGNED_DATA .globl vdso32_start, vdso32_end .balign PAGE_SIZE diff --git a/arch/s390/kernel/vdso64/vdso64_wrapper.S b/arch/s390/kernel/vdso64/vdso64_wrapper.S index d8e2ac14d564..c245842b516f 100644 --- a/arch/s390/kernel/vdso64/vdso64_wrapper.S +++ b/arch/s390/kernel/vdso64/vdso64_wrapper.S @@ -1,7 +1,8 @@ #include +#include #include - .section ".data.page_aligned" + __PAGE_ALIGNED_DATA .globl vdso64_start, vdso64_end .balign PAGE_SIZE diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h index 5d367caa0e36..549860d3be8f 100644 --- a/arch/x86/include/asm/cache.h +++ b/arch/x86/include/asm/cache.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_CACHE_H #define _ASM_X86_CACHE_H +#include + /* L1 cache line size */ #define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT) #define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT) @@ -13,7 +15,7 @@ #ifdef CONFIG_SMP #define __cacheline_aligned_in_smp \ __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ - __attribute__((__section__(".data.page_aligned"))) + __page_aligned_data #endif #endif diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 1dac23958427..218aad7ee76e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -626,7 +626,7 @@ ENTRY(empty_zero_page) * This starts the data section. */ #ifdef CONFIG_X86_PAE -.section ".data.page_aligned","wa" +__PAGE_ALIGNED_DATA /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) -- cgit v1.2.3 From dfc65094d0313cc48969fa60bcf33d693aeb05a7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 11:31:35 +0200 Subject: perf_counter: Rename 'event' to event_id/hw_event In preparation to the renames, to avoid a namespace clash. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 +++++++++++++++++++------------------- kernel/perf_counter.c | 26 ++++++++++----------- 2 files changed, 37 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a6c8b27553cd..b1f115696c84 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -124,9 +124,9 @@ static const u64 p6_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, }; -static u64 p6_pmu_event_map(int event) +static u64 p6_pmu_event_map(int hw_event) { - return p6_perfmon_event_map[event]; + return p6_perfmon_event_map[hw_event]; } /* @@ -137,7 +137,7 @@ static u64 p6_pmu_event_map(int event) */ #define P6_NOP_COUNTER 0x0000002EULL -static u64 p6_pmu_raw_event(u64 event) +static u64 p6_pmu_raw_event(u64 hw_event) { #define P6_EVNTSEL_EVENT_MASK 0x000000FFULL #define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -152,7 +152,7 @@ static u64 p6_pmu_raw_event(u64 event) P6_EVNTSEL_INV_MASK | \ P6_EVNTSEL_COUNTER_MASK) - return event & P6_EVNTSEL_MASK; + return hw_event & P6_EVNTSEL_MASK; } @@ -170,16 +170,16 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static u64 intel_pmu_event_map(int event) +static u64 intel_pmu_event_map(int hw_event) { - return intel_perfmon_event_map[event]; + return intel_perfmon_event_map[hw_event]; } /* - * Generalized hw caching related event table, filled + * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'event makes no sense on - * this CPU', any other value means the raw event + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event * ID. */ @@ -463,7 +463,7 @@ static const u64 atom_hw_cache_event_ids }, }; -static u64 intel_pmu_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 hw_event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -478,7 +478,7 @@ static u64 intel_pmu_raw_event(u64 event) CORE_EVNTSEL_INV_MASK | \ CORE_EVNTSEL_COUNTER_MASK) - return event & CORE_EVNTSEL_MASK; + return hw_event & CORE_EVNTSEL_MASK; } static const u64 amd_hw_cache_event_ids @@ -585,12 +585,12 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, }; -static u64 amd_pmu_event_map(int event) +static u64 amd_pmu_event_map(int hw_event) { - return amd_perfmon_event_map[event]; + return amd_perfmon_event_map[hw_event]; } -static u64 amd_pmu_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 hw_event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL @@ -605,7 +605,7 @@ static u64 amd_pmu_raw_event(u64 event) K7_EVNTSEL_INV_MASK | \ K7_EVNTSEL_COUNTER_MASK) - return event & K7_EVNTSEL_MASK; + return hw_event & K7_EVNTSEL_MASK; } /* @@ -956,7 +956,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) } /* - * Raw event type provide the config in the event structure + * Raw hw_event type provide the config in the hw_event structure */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); @@ -1245,7 +1245,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, ret = 1; } /* - * Quirk: certain CPUs dont like it if just 1 event is left: + * Quirk: certain CPUs dont like it if just 1 hw_event is left: */ if (unlikely(left < 2)) left = 2; @@ -1337,11 +1337,11 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static int fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { - unsigned int event; + unsigned int hw_event; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely((event == + if (unlikely((hw_event == x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && (hwc->sample_period == 1))) return X86_PMC_IDX_FIXED_BTS; @@ -1349,11 +1349,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -1970,7 +1970,7 @@ static int intel_pmu_init(void) /* * Check whether the Architectural PerfMon supports - * Branch Misses Retired Event or not. + * Branch Misses Retired hw_event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 13ad73aed4ca..62de0db8092b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3044,22 +3044,22 @@ perf_counter_read_event(struct perf_counter *counter, struct task_struct *task) { struct perf_output_handle handle; - struct perf_read_event event = { + struct perf_read_event read_event = { .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) + perf_counter_read_size(counter), + .size = sizeof(read_event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), }; int ret; - ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); + ret = perf_output_begin(&handle, counter, read_event.header.size, 0, 0); if (ret) return; - perf_output_put(&handle, event); + perf_output_put(&handle, read_event); perf_output_read(&handle, counter); perf_output_end(&handle); @@ -3698,14 +3698,14 @@ static int perf_swcounter_is_counting(struct perf_counter *counter) static int perf_swcounter_match(struct perf_counter *counter, enum perf_type_id type, - u32 event, struct pt_regs *regs) + u32 event_id, struct pt_regs *regs) { if (!perf_swcounter_is_counting(counter)) return 0; if (counter->attr.type != type) return 0; - if (counter->attr.config != event) + if (counter->attr.config != event_id) return 0; if (regs) { @@ -3721,7 +3721,7 @@ static int perf_swcounter_match(struct perf_counter *counter, static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, - u32 event, u64 nr, int nmi, + u32 event_id, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { @@ -3732,7 +3732,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event, regs)) + if (perf_swcounter_match(counter, type, event_id, regs)) perf_swcounter_add(counter, nr, nmi, data, regs); } rcu_read_unlock(); @@ -4036,17 +4036,17 @@ atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; static void sw_perf_counter_destroy(struct perf_counter *counter) { - u64 event = counter->attr.config; + u64 event_id = counter->attr.config; WARN_ON(counter->parent); - atomic_dec(&perf_swcounter_enabled[event]); + atomic_dec(&perf_swcounter_enabled[event_id]); } static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { const struct pmu *pmu = NULL; - u64 event = counter->attr.config; + u64 event_id = counter->attr.config; /* * Software counters (currently) can't in general distinguish @@ -4055,7 +4055,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (event) { + switch (event_id) { case PERF_COUNT_SW_CPU_CLOCK: pmu = &perf_ops_cpu_clock; @@ -4077,7 +4077,7 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_SW_CONTEXT_SWITCHES: case PERF_COUNT_SW_CPU_MIGRATIONS: if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event]); + atomic_inc(&perf_swcounter_enabled[event_id]); counter->destroy = sw_perf_counter_destroy; } pmu = &perf_ops_generic; -- cgit v1.2.3 From cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:02:48 +0200 Subject: perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Reviewed-by: Arjan van de Ven Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/include/asm/unistd.h | 2 +- arch/arm/kernel/calls.S | 2 +- arch/blackfin/include/asm/unistd.h | 2 +- arch/blackfin/mach-common/entry.S | 2 +- arch/frv/Kconfig | 2 +- arch/frv/include/asm/perf_counter.h | 17 - arch/frv/include/asm/perf_event.h | 17 + arch/frv/include/asm/unistd.h | 2 +- arch/frv/kernel/entry.S | 2 +- arch/frv/lib/Makefile | 2 +- arch/frv/lib/perf_counter.c | 19 - arch/frv/lib/perf_event.c | 19 + arch/m68k/include/asm/unistd.h | 2 +- arch/m68k/kernel/entry.S | 2 +- arch/m68knommu/kernel/syscalltable.S | 2 +- arch/microblaze/include/asm/unistd.h | 2 +- arch/microblaze/kernel/syscall_table.S | 2 +- arch/mips/include/asm/unistd.h | 6 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mips/kernel/scall64-64.S | 2 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/mn10300/include/asm/unistd.h | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/parisc/Kconfig | 2 +- arch/parisc/include/asm/perf_counter.h | 7 - arch/parisc/include/asm/perf_event.h | 7 + arch/parisc/include/asm/unistd.h | 4 +- arch/parisc/kernel/syscall_table.S | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hw_irq.h | 22 +- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/include/asm/perf_counter.h | 110 - arch/powerpc/include/asm/perf_event.h | 110 + arch/powerpc/include/asm/systbl.h | 2 +- arch/powerpc/include/asm/unistd.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_64.S | 8 +- arch/powerpc/kernel/irq.c | 8 +- arch/powerpc/kernel/mpc7450-pmu.c | 2 +- arch/powerpc/kernel/perf_callchain.c | 2 +- arch/powerpc/kernel/perf_counter.c | 1315 -------- arch/powerpc/kernel/perf_event.c | 1315 ++++++++ arch/powerpc/kernel/power4-pmu.c | 2 +- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power5-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- arch/powerpc/kernel/power7-pmu.c | 2 +- arch/powerpc/kernel/ppc970-pmu.c | 2 +- arch/powerpc/kernel/time.c | 30 +- arch/powerpc/mm/fault.c | 8 +- arch/powerpc/platforms/Kconfig.cputype | 4 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/perf_counter.h | 10 - arch/s390/include/asm/perf_event.h | 10 + arch/s390/include/asm/unistd.h | 2 +- arch/s390/kernel/compat_wrapper.S | 8 +- arch/s390/kernel/syscalls.S | 2 +- arch/s390/mm/fault.c | 8 +- arch/sh/Kconfig | 2 +- arch/sh/include/asm/perf_counter.h | 9 - arch/sh/include/asm/perf_event.h | 9 + arch/sh/include/asm/unistd_32.h | 2 +- arch/sh/include/asm/unistd_64.h | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sh/mm/fault_32.c | 8 +- arch/sh/mm/tlbflush_64.c | 8 +- arch/sparc/Kconfig | 4 +- arch/sparc/include/asm/perf_counter.h | 14 - arch/sparc/include/asm/perf_event.h | 14 + arch/sparc/include/asm/unistd.h | 2 +- arch/sparc/kernel/Makefile | 2 +- arch/sparc/kernel/nmi.c | 4 +- arch/sparc/kernel/pcr.c | 10 +- arch/sparc/kernel/perf_counter.c | 556 ---- arch/sparc/kernel/perf_event.c | 556 ++++ arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 4 +- arch/x86/Kconfig | 2 +- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/include/asm/perf_counter.h | 108 - arch/x86/include/asm/perf_event.h | 108 + arch/x86/include/asm/unistd_32.h | 2 +- arch/x86/include/asm/unistd_64.h | 4 +- arch/x86/kernel/apic/apic.c | 6 +- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 2298 -------------- arch/x86/kernel/cpu/perf_event.c | 2298 ++++++++++++++ arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- arch/x86/mm/fault.c | 8 +- arch/x86/oprofile/op_model_ppro.c | 4 +- arch/x86/oprofile/op_x86_model.h | 2 +- drivers/char/sysrq.c | 4 +- fs/exec.c | 6 +- include/asm-generic/unistd.h | 4 +- include/linux/init_task.h | 14 +- include/linux/perf_counter.h | 858 ------ include/linux/perf_event.h | 858 ++++++ include/linux/prctl.h | 4 +- include/linux/sched.h | 12 +- include/linux/syscalls.h | 6 +- include/trace/ftrace.h | 10 +- init/Kconfig | 8 +- kernel/Makefile | 2 +- kernel/exit.c | 8 +- kernel/fork.c | 8 +- kernel/perf_counter.c | 5000 ------------------------------- kernel/perf_event.c | 5000 +++++++++++++++++++++++++++++++ kernel/sched.c | 14 +- kernel/sys.c | 10 +- kernel/sys_ni.c | 2 +- kernel/sysctl.c | 22 +- kernel/timer.c | 4 +- kernel/trace/trace_syscalls.c | 6 +- mm/mmap.c | 6 +- mm/mprotect.c | 4 +- tools/perf/Makefile | 2 +- tools/perf/builtin-annotate.c | 28 +- tools/perf/builtin-record.c | 22 +- tools/perf/builtin-report.c | 48 +- tools/perf/builtin-sched.c | 20 +- tools/perf/builtin-stat.c | 10 +- tools/perf/builtin-timechart.c | 14 +- tools/perf/builtin-top.c | 12 +- tools/perf/builtin-trace.c | 22 +- tools/perf/design.txt | 58 +- tools/perf/perf.h | 12 +- tools/perf/util/event.h | 4 +- tools/perf/util/header.c | 6 +- tools/perf/util/header.h | 8 +- tools/perf/util/parse-events.c | 32 +- tools/perf/util/parse-events.h | 2 +- tools/perf/util/trace-event-info.c | 8 +- tools/perf/util/trace-event.h | 2 +- 141 files changed, 10694 insertions(+), 10694 deletions(-) delete mode 100644 arch/frv/include/asm/perf_counter.h create mode 100644 arch/frv/include/asm/perf_event.h delete mode 100644 arch/frv/lib/perf_counter.c create mode 100644 arch/frv/lib/perf_event.c delete mode 100644 arch/parisc/include/asm/perf_counter.h create mode 100644 arch/parisc/include/asm/perf_event.h delete mode 100644 arch/powerpc/include/asm/perf_counter.h create mode 100644 arch/powerpc/include/asm/perf_event.h delete mode 100644 arch/powerpc/kernel/perf_counter.c create mode 100644 arch/powerpc/kernel/perf_event.c delete mode 100644 arch/s390/include/asm/perf_counter.h create mode 100644 arch/s390/include/asm/perf_event.h delete mode 100644 arch/sh/include/asm/perf_counter.h create mode 100644 arch/sh/include/asm/perf_event.h delete mode 100644 arch/sparc/include/asm/perf_counter.h create mode 100644 arch/sparc/include/asm/perf_event.h delete mode 100644 arch/sparc/kernel/perf_counter.c create mode 100644 arch/sparc/kernel/perf_event.c delete mode 100644 arch/x86/include/asm/perf_counter.h create mode 100644 arch/x86/include/asm/perf_event.h delete mode 100644 arch/x86/kernel/cpu/perf_counter.c create mode 100644 arch/x86/kernel/cpu/perf_event.c delete mode 100644 include/linux/perf_counter.h create mode 100644 include/linux/perf_event.h delete mode 100644 kernel/perf_counter.c create mode 100644 kernel/perf_event.c (limited to 'arch/x86/kernel') diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 9122c9ee18fb..89f7eade20af 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -390,7 +390,7 @@ #define __NR_preadv (__NR_SYSCALL_BASE+361) #define __NR_pwritev (__NR_SYSCALL_BASE+362) #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363) -#define __NR_perf_counter_open (__NR_SYSCALL_BASE+364) +#define __NR_perf_event_open (__NR_SYSCALL_BASE+364) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index ecfa98954d1d..fafce1b5c69f 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -373,7 +373,7 @@ CALL(sys_preadv) CALL(sys_pwritev) CALL(sys_rt_tgsigqueueinfo) - CALL(sys_perf_counter_open) + CALL(sys_perf_event_open) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c8e7ee4768cd..02b1529dad57 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -381,7 +381,7 @@ #define __NR_preadv 366 #define __NR_pwritev 367 #define __NR_rt_tgsigqueueinfo 368 -#define __NR_perf_counter_open 369 +#define __NR_perf_event_open 369 #define __NR_syscall 370 #define NR_syscalls __NR_syscall diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 01af24cde362..1e7cac23e25f 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1620,7 +1620,7 @@ ENTRY(_sys_call_table) .long _sys_preadv .long _sys_pwritev .long _sys_rt_tgsigqueueinfo - .long _sys_perf_counter_open + .long _sys_perf_event_open .rept NR_syscalls-(.-_sys_call_table)/4 .long _sys_ni_syscall diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index b86e19c9b5b0..4b5830bcbe2e 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -7,7 +7,7 @@ config FRV default y select HAVE_IDE select HAVE_ARCH_TRACEHOOK - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config ZONE_DMA bool diff --git a/arch/frv/include/asm/perf_counter.h b/arch/frv/include/asm/perf_counter.h deleted file mode 100644 index ccf726e61b2e..000000000000 --- a/arch/frv/include/asm/perf_counter.h +++ /dev/null @@ -1,17 +0,0 @@ -/* FRV performance counter support - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _ASM_PERF_COUNTER_H -#define _ASM_PERF_COUNTER_H - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#endif /* _ASM_PERF_COUNTER_H */ diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h new file mode 100644 index 000000000000..a69e0155d146 --- /dev/null +++ b/arch/frv/include/asm/perf_event.h @@ -0,0 +1,17 @@ +/* FRV performance event support + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _ASM_PERF_EVENT_H +#define _ASM_PERF_EVENT_H + +#define PERF_EVENT_INDEX_OFFSET 0 + +#endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 4a8fb427ce0a..be6ef0f5cd42 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -342,7 +342,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S index fde1e446b440..189397ec012a 100644 --- a/arch/frv/kernel/entry.S +++ b/arch/frv/kernel/entry.S @@ -1525,6 +1525,6 @@ sys_call_table: .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open syscall_table_size = (. - sys_call_table) diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile index 0a377210c89b..f4709756d0d9 100644 --- a/arch/frv/lib/Makefile +++ b/arch/frv/lib/Makefile @@ -5,4 +5,4 @@ lib-y := \ __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \ checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \ - outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_counter.o + outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o diff --git a/arch/frv/lib/perf_counter.c b/arch/frv/lib/perf_counter.c deleted file mode 100644 index 2000feecd571..000000000000 --- a/arch/frv/lib/perf_counter.c +++ /dev/null @@ -1,19 +0,0 @@ -/* Performance counter handling - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include - -/* - * mark the performance counter as pending - */ -void set_perf_counter_pending(void) -{ -} diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c new file mode 100644 index 000000000000..9ac5acfd2e91 --- /dev/null +++ b/arch/frv/lib/perf_event.c @@ -0,0 +1,19 @@ +/* Performance event handling + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include + +/* + * mark the performance event as pending + */ +void set_perf_event_pending(void) +{ +} diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 946d8691f2b0..48b87f5ced50 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -335,7 +335,7 @@ #define __NR_preadv 329 #define __NR_pwritev 330 #define __NR_rt_tgsigqueueinfo 331 -#define __NR_perf_counter_open 332 +#define __NR_perf_event_open 332 #ifdef __KERNEL__ diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 922f52e7ed1a..c5b33634c980 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -756,5 +756,5 @@ sys_call_table: .long sys_preadv .long sys_pwritev /* 330 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S index 0ae123e08985..23535cc415ae 100644 --- a/arch/m68knommu/kernel/syscalltable.S +++ b/arch/m68knommu/kernel/syscalltable.S @@ -350,7 +350,7 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev /* 330 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open .rept NR_syscalls-(.-sys_call_table)/4 .long sys_ni_syscall diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index 0b852327c0e7..cb05a07e55e9 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -381,7 +381,7 @@ #define __NR_preadv 363 /* new */ #define __NR_pwritev 364 /* new */ #define __NR_rt_tgsigqueueinfo 365 /* new */ -#define __NR_perf_counter_open 366 /* new */ +#define __NR_perf_event_open 366 /* new */ #define __NR_syscalls 367 diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S index 457216097dfd..ecec19155135 100644 --- a/arch/microblaze/kernel/syscall_table.S +++ b/arch/microblaze/kernel/syscall_table.S @@ -370,4 +370,4 @@ ENTRY(sys_call_table) .long sys_ni_syscall .long sys_ni_syscall .long sys_rt_tgsigqueueinfo /* 365 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index e753a777949b..8c9dfa9e9018 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -353,7 +353,7 @@ #define __NR_preadv (__NR_Linux + 330) #define __NR_pwritev (__NR_Linux + 331) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 332) -#define __NR_perf_counter_open (__NR_Linux + 333) +#define __NR_perf_event_open (__NR_Linux + 333) #define __NR_accept4 (__NR_Linux + 334) /* @@ -664,7 +664,7 @@ #define __NR_preadv (__NR_Linux + 289) #define __NR_pwritev (__NR_Linux + 290) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 291) -#define __NR_perf_counter_open (__NR_Linux + 292) +#define __NR_perf_event_open (__NR_Linux + 292) #define __NR_accept4 (__NR_Linux + 293) /* @@ -979,7 +979,7 @@ #define __NR_preadv (__NR_Linux + 293) #define __NR_pwritev (__NR_Linux + 294) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 295) -#define __NR_perf_counter_open (__NR_Linux + 296) +#define __NR_perf_event_open (__NR_Linux + 296) #define __NR_accept4 (__NR_Linux + 297) /* diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 7c2de4f091c4..fd2a9bb620d6 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -581,7 +581,7 @@ einval: li v0, -ENOSYS sys sys_preadv 6 /* 4330 */ sys sys_pwritev 6 sys sys_rt_tgsigqueueinfo 4 - sys sys_perf_counter_open 5 + sys sys_perf_event_open 5 sys sys_accept4 4 .endm diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index b97b993846d6..18bf7f32c5e4 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -418,6 +418,6 @@ sys_call_table: PTR sys_preadv PTR sys_pwritev /* 5390 */ PTR sys_rt_tgsigqueueinfo - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 1a6ae124635b..6ebc07976694 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -416,6 +416,6 @@ EXPORT(sysn32_call_table) PTR sys_preadv PTR sys_pwritev PTR compat_sys_rt_tgsigqueueinfo /* 5295 */ - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index cd31087a651f..9bbf9775e0bd 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -536,6 +536,6 @@ sys_call_table: PTR compat_sys_preadv /* 4330 */ PTR compat_sys_pwritev PTR compat_sys_rt_tgsigqueueinfo - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sys_call_table,.-sys_call_table diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index fad68616af32..2a983931c11f 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -347,7 +347,7 @@ #define __NR_preadv 334 #define __NR_pwritev 335 #define __NR_rt_tgsigqueueinfo 336 -#define __NR_perf_counter_open 337 +#define __NR_perf_event_open 337 #ifdef __KERNEL__ diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index e0d2563af4f2..a94e7ea3faa6 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -723,7 +723,7 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev /* 335 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open nr_syscalls=(.-sys_call_table)/4 diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 06f8d5b5b0f9..f388dc68f605 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -16,7 +16,7 @@ config PARISC select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE select BUG - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/perf_counter.h b/arch/parisc/include/asm/perf_counter.h deleted file mode 100644 index dc9e829f7013..000000000000 --- a/arch/parisc/include/asm/perf_counter.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_PARISC_PERF_COUNTER_H -#define __ASM_PARISC_PERF_COUNTER_H - -/* parisc only supports software counters through this interface. */ -static inline void set_perf_counter_pending(void) { } - -#endif /* __ASM_PARISC_PERF_COUNTER_H */ diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h new file mode 100644 index 000000000000..cc146427d8f9 --- /dev/null +++ b/arch/parisc/include/asm/perf_event.h @@ -0,0 +1,7 @@ +#ifndef __ASM_PARISC_PERF_EVENT_H +#define __ASM_PARISC_PERF_EVENT_H + +/* parisc only supports software events through this interface. */ +static inline void set_perf_event_pending(void) { } + +#endif /* __ASM_PARISC_PERF_EVENT_H */ diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index f3d3b8b012c4..cda158318c62 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -810,9 +810,9 @@ #define __NR_preadv (__NR_Linux + 315) #define __NR_pwritev (__NR_Linux + 316) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317) -#define __NR_perf_counter_open (__NR_Linux + 318) +#define __NR_perf_event_open (__NR_Linux + 318) -#define __NR_Linux_syscalls (__NR_perf_counter_open + 1) +#define __NR_Linux_syscalls (__NR_perf_event_open + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index cf145eb026b3..843f423dec67 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -416,7 +416,7 @@ ENTRY_COMP(preadv) /* 315 */ ENTRY_COMP(pwritev) ENTRY_COMP(rt_tgsigqueueinfo) - ENTRY_SAME(perf_counter_open) + ENTRY_SAME(perf_event_open) /* Nothing yet */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8250902265c6..4fd479059d65 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,7 +129,7 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config EARLY_PRINTK bool diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index e73d554538dd..abbc2aaaced5 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -135,43 +135,43 @@ static inline int irqs_disabled_flags(unsigned long flags) */ struct irq_chip; -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PPC64 -static inline unsigned long test_perf_counter_pending(void) +static inline unsigned long test_perf_event_pending(void) { unsigned long x; asm volatile("lbz %0,%1(13)" : "=r" (x) - : "i" (offsetof(struct paca_struct, perf_counter_pending))); + : "i" (offsetof(struct paca_struct, perf_event_pending))); return x; } -static inline void set_perf_counter_pending(void) +static inline void set_perf_event_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (1), - "i" (offsetof(struct paca_struct, perf_counter_pending))); + "i" (offsetof(struct paca_struct, perf_event_pending))); } -static inline void clear_perf_counter_pending(void) +static inline void clear_perf_event_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (0), - "i" (offsetof(struct paca_struct, perf_counter_pending))); + "i" (offsetof(struct paca_struct, perf_event_pending))); } #endif /* CONFIG_PPC64 */ -#else /* CONFIG_PERF_COUNTERS */ +#else /* CONFIG_PERF_EVENTS */ -static inline unsigned long test_perf_counter_pending(void) +static inline unsigned long test_perf_event_pending(void) { return 0; } -static inline void clear_perf_counter_pending(void) {} -#endif /* CONFIG_PERF_COUNTERS */ +static inline void clear_perf_event_pending(void) {} +#endif /* CONFIG_PERF_EVENTS */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index b634456ea893..154f405b642f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -122,7 +122,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ - u8 perf_counter_pending; /* PM interrupt while soft-disabled */ + u8 perf_event_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h deleted file mode 100644 index 0ea0639fcf75..000000000000 --- a/arch/powerpc/include/asm/perf_counter.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Performance counter support - PowerPC-specific definitions. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include - -#include - -#define MAX_HWCOUNTERS 8 -#define MAX_EVENT_ALTERNATIVES 8 -#define MAX_LIMITED_HWCOUNTERS 2 - -/* - * This struct provides the constants and functions needed to - * describe the PMU on a particular POWER-family CPU. - */ -struct power_pmu { - const char *name; - int n_counter; - int max_alternatives; - unsigned long add_fields; - unsigned long test_adder; - int (*compute_mmcr)(u64 events[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]); - int (*get_constraint)(u64 event, unsigned long *mskp, - unsigned long *valp); - int (*get_alternatives)(u64 event, unsigned int flags, - u64 alt[]); - void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); - int (*limited_pmc_event)(u64 event); - u32 flags; - int n_generic; - int *generic_events; - int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -}; - -/* - * Values for power_pmu.flags - */ -#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ -#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ - -/* - * Values for flags to get_alternatives() - */ -#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ -#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ -#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ - -extern int register_power_pmu(struct power_pmu *); - -struct pt_regs; -extern unsigned long perf_misc_flags(struct pt_regs *regs); -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); - -#define PERF_COUNTER_INDEX_OFFSET 1 - -/* - * Only override the default definitions in include/linux/perf_counter.h - * if we have hardware PMU support. - */ -#ifdef CONFIG_PPC_PERF_CTRS -#define perf_misc_flags(regs) perf_misc_flags(regs) -#endif - -/* - * The power_pmu.get_constraint function returns a 32/64-bit value and - * a 32/64-bit mask that express the constraints between this event and - * other events. - * - * The value and mask are divided up into (non-overlapping) bitfields - * of three different types: - * - * Select field: this expresses the constraint that some set of bits - * in MMCR* needs to be set to a specific value for this event. For a - * select field, the mask contains 1s in every bit of the field, and - * the value contains a unique value for each possible setting of the - * MMCR* bits. The constraint checking code will ensure that two events - * that set the same field in their masks have the same value in their - * value dwords. - * - * Add field: this expresses the constraint that there can be at most - * N events in a particular class. A field of k bits can be used for - * N <= 2^(k-1) - 1. The mask has the most significant bit of the field - * set (and the other bits 0), and the value has only the least significant - * bit of the field set. In addition, the 'add_fields' and 'test_adder' - * in the struct power_pmu for this processor come into play. The - * add_fields value contains 1 in the LSB of the field, and the - * test_adder contains 2^(k-1) - 1 - N in the field. - * - * NAND field: this expresses the constraint that you may not have events - * in all of a set of classes. (For example, on PPC970, you can't select - * events from the FPU, ISU and IDU simultaneously, although any two are - * possible.) For N classes, the field is N+1 bits wide, and each class - * is assigned one bit from the least-significant N bits. The mask has - * only the most-significant bit set, and the value has only the bit - * for the event's class set. The test_adder has the least significant - * bit set in the field. - * - * If an event is not subject to the constraint expressed by a particular - * field, then it will have 0 in both the mask and value for that field. - */ diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h new file mode 100644 index 000000000000..2499aaadaeb9 --- /dev/null +++ b/arch/powerpc/include/asm/perf_event.h @@ -0,0 +1,110 @@ +/* + * Performance event support - PowerPC-specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include + +#include + +#define MAX_HWEVENTS 8 +#define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWEVENTS 2 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. + */ +struct power_pmu { + const char *name; + int n_event; + int max_alternatives; + unsigned long add_fields; + unsigned long test_adder; + int (*compute_mmcr)(u64 events[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]); + int (*get_constraint)(u64 event_id, unsigned long *mskp, + unsigned long *valp); + int (*get_alternatives)(u64 event_id, unsigned int flags, + u64 alt[]); + void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); + int (*limited_pmc_event)(u64 event_id); + u32 flags; + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ + +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + +extern int register_power_pmu(struct power_pmu *); + +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +#define PERF_EVENT_INDEX_OFFSET 1 + +/* + * Only override the default definitions in include/linux/perf_event.h + * if we have hardware PMU support. + */ +#ifdef CONFIG_PPC_PERF_CTRS +#define perf_misc_flags(regs) perf_misc_flags(regs) +#endif + +/* + * The power_pmu.get_constraint function returns a 32/64-bit value and + * a 32/64-bit mask that express the constraints between this event_id and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event_id. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event_id's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event_id is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index ed24bd92fe49..c7d671a7d9a1 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,7 +322,7 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL_SPU(perf_counter_open) +SYSCALL_SPU(perf_event_open) COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index cef080bfc607..f6ca76176766 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -341,7 +341,7 @@ #define __NR_dup3 316 #define __NR_pipe2 317 #define __NR_inotify_init1 318 -#define __NR_perf_counter_open 319 +#define __NR_perf_event_open 319 #define __NR_preadv 320 #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 569f79ccd310..b23664a0b86c 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o +obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f0df285f0f87..0812b0f414bb 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -133,7 +133,7 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); - DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); + DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 66bcda34a6bb..900e0eea0099 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -556,14 +556,14 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) 2: TRACE_AND_RESTORE_IRQ(r5); -#ifdef CONFIG_PERF_COUNTERS - /* check paca->perf_counter_pending if we're enabling ints */ +#ifdef CONFIG_PERF_EVENTS + /* check paca->perf_event_pending if we're enabling ints */ lbz r3,PACAPERFPEND(r13) and. r3,r3,r5 beq 27f - bl .perf_counter_do_pending + bl .perf_event_do_pending 27: -#endif /* CONFIG_PERF_COUNTERS */ +#endif /* CONFIG_PERF_EVENTS */ /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index f7f376ea7b17..e5d121177984 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -138,9 +138,9 @@ notrace void raw_local_irq_restore(unsigned long en) } #endif /* CONFIG_PPC_STD_MMU_64 */ - if (test_perf_counter_pending()) { - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (test_perf_event_pending()) { + clear_perf_event_pending(); + perf_event_do_pending(); } /* diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c index cc466d039af6..09d72028f317 100644 --- a/arch/powerpc/kernel/mpc7450-pmu.c +++ b/arch/powerpc/kernel/mpc7450-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index f74b62c67511..0a03cf70d247 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c deleted file mode 100644 index 5ccf9bca96c0..000000000000 --- a/arch/powerpc/kernel/perf_counter.c +++ /dev/null @@ -1,1315 +0,0 @@ -/* - * Performance counter support - powerpc architecture code - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct cpu_hw_counters { - int n_counters; - int n_percpu; - int disabled; - int n_added; - int n_limited; - u8 pmcs_enabled; - struct perf_counter *counter[MAX_HWCOUNTERS]; - u64 events[MAX_HWCOUNTERS]; - unsigned int flags[MAX_HWCOUNTERS]; - unsigned long mmcr[3]; - struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; - u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; - u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; - unsigned long amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; - unsigned long avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; -}; -DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); - -struct power_pmu *ppmu; - -/* - * Normally, to ignore kernel events we set the FCS (freeze counters - * in supervisor mode) bit in MMCR0, but if the kernel runs with the - * hypervisor bit set in the MSR, or if we are running on a processor - * where the hypervisor bit is forced to 1 (as on Apple G5 processors), - * then we need to use the FCHV bit to ignore kernel events. - */ -static unsigned int freeze_counters_kernel = MMCR0_FCS; - -/* - * 32-bit doesn't have MMCRA but does have an MMCR2, - * and a few other names are different. - */ -#ifdef CONFIG_PPC32 - -#define MMCR0_FCHV 0 -#define MMCR0_PMCjCE MMCR0_PMCnCE - -#define SPRN_MMCRA SPRN_MMCR2 -#define MMCRA_SAMPLE_ENABLE 0 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_read_regs(struct pt_regs *regs) { } -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return 0; -} - -#endif /* CONFIG_PPC32 */ - -/* - * Things that are specific to 64-bit implementations. - */ -#ifdef CONFIG_PPC64 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { - unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; - if (slot > 1) - return 4 * (slot - 1); - } - return 0; -} - -/* - * The user wants a data address recorded. - * If we're not doing instruction sampling, give them the SDAR - * (sampled data address). If we are doing instruction sampling, then - * only give them the SDAR if it corresponds to the instruction - * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC - * bit in MMCRA. - */ -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) -{ - unsigned long mmcra = regs->dsisr; - unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? - POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; - - if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - *addrp = mfspr(SPRN_SDAR); -} - -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ - - if (ppmu->flags & PPMU_ALT_SIPR) { - if (mmcra & POWER6_MMCRA_SIHV) - return PERF_EVENT_MISC_HYPERVISOR; - return (mmcra & POWER6_MMCRA_SIPR) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; - } - if (mmcra & MMCRA_SIHV) - return PERF_EVENT_MISC_HYPERVISOR; - return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER : - PERF_EVENT_MISC_KERNEL; -} - -/* - * Overload regs->dsisr to store MMCRA so we only need to read it once - * on each interrupt. - */ -static inline void perf_read_regs(struct pt_regs *regs) -{ - regs->dsisr = mfspr(SPRN_MMCRA); -} - -/* - * If interrupts were soft-disabled when a PMU interrupt occurs, treat - * it as an NMI. - */ -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return !regs->softe; -} - -#endif /* CONFIG_PPC64 */ - -static void perf_counter_interrupt(struct pt_regs *regs); - -void perf_counter_print_debug(void) -{ -} - -/* - * Read one performance monitor counter (PMC). - */ -static unsigned long read_pmc(int idx) -{ - unsigned long val; - - switch (idx) { - case 1: - val = mfspr(SPRN_PMC1); - break; - case 2: - val = mfspr(SPRN_PMC2); - break; - case 3: - val = mfspr(SPRN_PMC3); - break; - case 4: - val = mfspr(SPRN_PMC4); - break; - case 5: - val = mfspr(SPRN_PMC5); - break; - case 6: - val = mfspr(SPRN_PMC6); - break; -#ifdef CONFIG_PPC64 - case 7: - val = mfspr(SPRN_PMC7); - break; - case 8: - val = mfspr(SPRN_PMC8); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to read PMC%d\n", idx); - val = 0; - } - return val; -} - -/* - * Write one PMC. - */ -static void write_pmc(int idx, unsigned long val) -{ - switch (idx) { - case 1: - mtspr(SPRN_PMC1, val); - break; - case 2: - mtspr(SPRN_PMC2, val); - break; - case 3: - mtspr(SPRN_PMC3, val); - break; - case 4: - mtspr(SPRN_PMC4, val); - break; - case 5: - mtspr(SPRN_PMC5, val); - break; - case 6: - mtspr(SPRN_PMC6, val); - break; -#ifdef CONFIG_PPC64 - case 7: - mtspr(SPRN_PMC7, val); - break; - case 8: - mtspr(SPRN_PMC8, val); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to write PMC%d\n", idx); - } -} - -/* - * Check if a set of events can all go on the PMU at once. - * If they can't, this will look at alternative codes for the events - * and see if any combination of alternative codes is feasible. - * The feasible set is returned in event[]. - */ -static int power_check_constraints(struct cpu_hw_counters *cpuhw, - u64 event[], unsigned int cflags[], - int n_ev) -{ - unsigned long mask, value, nv; - unsigned long smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; - int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; - int i, j; - unsigned long addf = ppmu->add_fields; - unsigned long tadd = ppmu->test_adder; - - if (n_ev > ppmu->n_counter) - return -1; - - /* First see if the events will go on as-is */ - for (i = 0; i < n_ev; ++i) { - if ((cflags[i] & PPMU_LIMITED_PMC_REQD) - && !ppmu->limited_pmc_event(event[i])) { - ppmu->get_alternatives(event[i], cflags[i], - cpuhw->alternatives[i]); - event[i] = cpuhw->alternatives[i][0]; - } - if (ppmu->get_constraint(event[i], &cpuhw->amasks[i][0], - &cpuhw->avalues[i][0])) - return -1; - } - value = mask = 0; - for (i = 0; i < n_ev; ++i) { - nv = (value | cpuhw->avalues[i][0]) + - (value & cpuhw->avalues[i][0] & addf); - if ((((nv + tadd) ^ value) & mask) != 0 || - (((nv + tadd) ^ cpuhw->avalues[i][0]) & - cpuhw->amasks[i][0]) != 0) - break; - value = nv; - mask |= cpuhw->amasks[i][0]; - } - if (i == n_ev) - return 0; /* all OK */ - - /* doesn't work, gather alternatives... */ - if (!ppmu->get_alternatives) - return -1; - for (i = 0; i < n_ev; ++i) { - choice[i] = 0; - n_alt[i] = ppmu->get_alternatives(event[i], cflags[i], - cpuhw->alternatives[i]); - for (j = 1; j < n_alt[i]; ++j) - ppmu->get_constraint(cpuhw->alternatives[i][j], - &cpuhw->amasks[i][j], - &cpuhw->avalues[i][j]); - } - - /* enumerate all possibilities and see if any will work */ - i = 0; - j = -1; - value = mask = nv = 0; - while (i < n_ev) { - if (j >= 0) { - /* we're backtracking, restore context */ - value = svalues[i]; - mask = smasks[i]; - j = choice[i]; - } - /* - * See if any alternative k for event i, - * where k > j, will satisfy the constraints. - */ - while (++j < n_alt[i]) { - nv = (value | cpuhw->avalues[i][j]) + - (value & cpuhw->avalues[i][j] & addf); - if ((((nv + tadd) ^ value) & mask) == 0 && - (((nv + tadd) ^ cpuhw->avalues[i][j]) - & cpuhw->amasks[i][j]) == 0) - break; - } - if (j >= n_alt[i]) { - /* - * No feasible alternative, backtrack - * to event i-1 and continue enumerating its - * alternatives from where we got up to. - */ - if (--i < 0) - return -1; - } else { - /* - * Found a feasible alternative for event i, - * remember where we got up to with this event, - * go on to the next event, and start with - * the first alternative for it. - */ - choice[i] = j; - svalues[i] = value; - smasks[i] = mask; - value = nv; - mask |= cpuhw->amasks[i][j]; - ++i; - j = -1; - } - } - - /* OK, we have a feasible combination, tell the caller the solution */ - for (i = 0; i < n_ev; ++i) - event[i] = cpuhw->alternatives[i][choice[i]]; - return 0; -} - -/* - * Check if newly-added counters have consistent settings for - * exclude_{user,kernel,hv} with each other and any previously - * added counters. - */ -static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], - int n_prev, int n_new) -{ - int eu = 0, ek = 0, eh = 0; - int i, n, first; - struct perf_counter *counter; - - n = n_prev + n_new; - if (n <= 1) - return 0; - - first = 1; - for (i = 0; i < n; ++i) { - if (cflags[i] & PPMU_LIMITED_PMC_OK) { - cflags[i] &= ~PPMU_LIMITED_PMC_REQD; - continue; - } - counter = ctrs[i]; - if (first) { - eu = counter->attr.exclude_user; - ek = counter->attr.exclude_kernel; - eh = counter->attr.exclude_hv; - first = 0; - } else if (counter->attr.exclude_user != eu || - counter->attr.exclude_kernel != ek || - counter->attr.exclude_hv != eh) { - return -EAGAIN; - } - } - - if (eu || ek || eh) - for (i = 0; i < n; ++i) - if (cflags[i] & PPMU_LIMITED_PMC_OK) - cflags[i] |= PPMU_LIMITED_PMC_REQD; - - return 0; -} - -static void power_pmu_read(struct perf_counter *counter) -{ - s64 val, delta, prev; - - if (!counter->hw.idx) - return; - /* - * Performance monitor interrupts come even when interrupts - * are soft-disabled, as long as interrupts are hard-enabled. - * Therefore we treat them like NMIs. - */ - do { - prev = atomic64_read(&counter->hw.prev_count); - barrier(); - val = read_pmc(counter->hw.idx); - } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); - - /* The counters are only 32 bits wide */ - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &counter->hw.period_left); -} - -/* - * On some machines, PMC5 and PMC6 can't be written, don't respect - * the freeze conditions, and don't generate interrupts. This tells - * us if `counter' is using such a PMC. - */ -static int is_limited_pmc(int pmcnum) -{ - return (ppmu->flags & PPMU_LIMITED_PMC5_6) - && (pmcnum == 5 || pmcnum == 6); -} - -static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_counter *counter; - u64 val, prev, delta; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - counter = cpuhw->limited_counter[i]; - if (!counter->hw.idx) - continue; - val = (counter->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&counter->hw.prev_count); - counter->hw.idx = 0; - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - } -} - -static void thaw_limited_counters(struct cpu_hw_counters *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_counter *counter; - u64 val; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - counter = cpuhw->limited_counter[i]; - counter->hw.idx = cpuhw->limited_hwidx[i]; - val = (counter->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&counter->hw.prev_count, val); - perf_counter_update_userpage(counter); - } -} - -/* - * Since limited counters don't respect the freeze conditions, we - * have to read them immediately after freezing or unfreezing the - * other counters. We try to keep the values from the limited - * counters as consistent as possible by keeping the delay (in - * cycles and instructions) between freezing/unfreezing and reading - * the limited counters as small and consistent as possible. - * Therefore, if any limited counters are in use, we read them - * both, and always in the same order, to minimize variability, - * and do it inside the same asm that writes MMCR0. - */ -static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) -{ - unsigned long pmc5, pmc6; - - if (!cpuhw->n_limited) { - mtspr(SPRN_MMCR0, mmcr0); - return; - } - - /* - * Write MMCR0, then read PMC5 and PMC6 immediately. - * To ensure we don't get a performance monitor interrupt - * between writing MMCR0 and freezing/thawing the limited - * counters, we first write MMCR0 with the counter overflow - * interrupt enable bits turned off. - */ - asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" - : "=&r" (pmc5), "=&r" (pmc6) - : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), - "i" (SPRN_MMCR0), - "i" (SPRN_PMC5), "i" (SPRN_PMC6)); - - if (mmcr0 & MMCR0_FC) - freeze_limited_counters(cpuhw, pmc5, pmc6); - else - thaw_limited_counters(cpuhw, pmc5, pmc6); - - /* - * Write the full MMCR0 including the counter overflow interrupt - * enable bits, if necessary. - */ - if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) - mtspr(SPRN_MMCR0, mmcr0); -} - -/* - * Disable all counters to prevent PMU interrupts and to allow - * counters to be added or removed. - */ -void hw_perf_disable(void) -{ - struct cpu_hw_counters *cpuhw; - unsigned long flags; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_counters); - - if (!cpuhw->disabled) { - cpuhw->disabled = 1; - cpuhw->n_added = 0; - - /* - * Check if we ever enabled the PMU on this cpu. - */ - if (!cpuhw->pmcs_enabled) { - ppc_enable_pmcs(); - cpuhw->pmcs_enabled = 1; - } - - /* - * Disable instruction sampling if it was enabled - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mtspr(SPRN_MMCRA, - cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mb(); - } - - /* - * Set the 'freeze counters' bit. - * The barrier is to make sure the mtspr has been - * executed and the PMU has frozen the counters - * before we return. - */ - write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); - mb(); - } - local_irq_restore(flags); -} - -/* - * Re-enable all counters if disable == 0. - * If we were previously disabled and counters were added, then - * put the new config on the PMU. - */ -void hw_perf_enable(void) -{ - struct perf_counter *counter; - struct cpu_hw_counters *cpuhw; - unsigned long flags; - long i; - unsigned long val; - s64 left; - unsigned int hwc_index[MAX_HWCOUNTERS]; - int n_lim; - int idx; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_counters); - if (!cpuhw->disabled) { - local_irq_restore(flags); - return; - } - cpuhw->disabled = 0; - - /* - * If we didn't change anything, or only removed counters, - * no need to recalculate MMCR* settings and reset the PMCs. - * Just reenable the PMU with the current MMCR* settings - * (possibly updated for removal of counters). - */ - if (!cpuhw->n_added) { - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - if (cpuhw->n_counters == 0) - ppc_set_pmu_inuse(0); - goto out_enable; - } - - /* - * Compute MMCR* values for the new set of counters - */ - if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, - cpuhw->mmcr)) { - /* shouldn't ever get here */ - printk(KERN_ERR "oops compute_mmcr failed\n"); - goto out; - } - - /* - * Add in MMCR0 freeze bits corresponding to the - * attr.exclude_* bits for the first counter. - * We have already checked that all counters have the - * same values for these bits as the first counter. - */ - counter = cpuhw->counter[0]; - if (counter->attr.exclude_user) - cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->attr.exclude_kernel) - cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->attr.exclude_hv) - cpuhw->mmcr[0] |= MMCR0_FCHV; - - /* - * Write the new configuration to MMCR* with the freeze - * bit set and set the hardware counters to their initial values. - * Then unfreeze the counters. - */ - ppc_set_pmu_inuse(1); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) - | MMCR0_FC); - - /* - * Read off any pre-existing counters that need to move - * to another PMC. - */ - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_pmu_read(counter); - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; - } - } - - /* - * Initialize the PMCs for all the new and moved counters. - */ - cpuhw->n_limited = n_lim = 0; - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter->hw.idx) - continue; - idx = hwc_index[i] + 1; - if (is_limited_pmc(idx)) { - cpuhw->limited_counter[n_lim] = counter; - cpuhw->limited_hwidx[n_lim] = idx; - ++n_lim; - continue; - } - val = 0; - if (counter->hw.sample_period) { - left = atomic64_read(&counter->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; - } - atomic64_set(&counter->hw.prev_count, val); - counter->hw.idx = idx; - write_pmc(idx, val); - perf_counter_update_userpage(counter); - } - cpuhw->n_limited = n_lim; - cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; - - out_enable: - mb(); - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - /* - * Enable instruction sampling if necessary - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mb(); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); - } - - out: - local_irq_restore(flags); -} - -static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], u64 *events, - unsigned int *flags) -{ - int n = 0; - struct perf_counter *counter; - - if (!is_software_counter(group)) { - if (n >= max_count) - return -1; - ctrs[n] = group; - flags[n] = group->hw.counter_base; - events[n++] = group->hw.config; - } - list_for_each_entry(counter, &group->sibling_list, list_entry) { - if (!is_software_counter(counter) && - counter->state != PERF_COUNTER_STATE_OFF) { - if (n >= max_count) - return -1; - ctrs[n] = counter; - flags[n] = counter->hw.counter_base; - events[n++] = counter->hw.config; - } - } - return n; -} - -static void counter_sched_in(struct perf_counter *counter, int cpu) -{ - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; - counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; - if (is_software_counter(counter)) - counter->pmu->enable(counter); -} - -/* - * Called to enable a whole group of counters. - * Returns 1 if the group was enabled, or -EAGAIN if it could not be. - * Assumes the caller has disabled interrupts and has - * frozen the PMU with hw_perf_save_disable. - */ -int hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - struct cpu_hw_counters *cpuhw; - long i, n, n0; - struct perf_counter *sub; - - if (!ppmu) - return 0; - cpuhw = &__get_cpu_var(cpu_hw_counters); - n0 = cpuhw->n_counters; - n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->counter[n0], &cpuhw->events[n0], - &cpuhw->flags[n0]); - if (n < 0) - return -EAGAIN; - if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n)) - return -EAGAIN; - i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); - if (i < 0) - return -EAGAIN; - cpuhw->n_counters = n0 + n; - cpuhw->n_added += n; - - /* - * OK, this group can go on; update counter states etc., - * and enable any software counters - */ - for (i = n0; i < n0 + n; ++i) - cpuhw->counter[i]->hw.config = cpuhw->events[i]; - cpuctx->active_oncpu += n; - n = 1; - counter_sched_in(group_leader, cpu); - list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { - if (sub->state != PERF_COUNTER_STATE_OFF) { - counter_sched_in(sub, cpu); - ++n; - } - } - ctx->nr_active += n; - - return 1; -} - -/* - * Add a counter to the PMU. - * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_enable to do the - * actual work of reconfiguring the PMU. - */ -static int power_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuhw; - unsigned long flags; - int n0; - int ret = -EAGAIN; - - local_irq_save(flags); - perf_disable(); - - /* - * Add the counter to the list (if there is room) - * and check whether the total set is still feasible. - */ - cpuhw = &__get_cpu_var(cpu_hw_counters); - n0 = cpuhw->n_counters; - if (n0 >= ppmu->n_counter) - goto out; - cpuhw->counter[n0] = counter; - cpuhw->events[n0] = counter->hw.config; - cpuhw->flags[n0] = counter->hw.counter_base; - if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1)) - goto out; - if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) - goto out; - - counter->hw.config = cpuhw->events[n0]; - ++cpuhw->n_counters; - ++cpuhw->n_added; - - ret = 0; - out: - perf_enable(); - local_irq_restore(flags); - return ret; -} - -/* - * Remove a counter from the PMU. - */ -static void power_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuhw; - long i; - unsigned long flags; - - local_irq_save(flags); - perf_disable(); - - power_pmu_read(counter); - - cpuhw = &__get_cpu_var(cpu_hw_counters); - for (i = 0; i < cpuhw->n_counters; ++i) { - if (counter == cpuhw->counter[i]) { - while (++i < cpuhw->n_counters) - cpuhw->counter[i-1] = cpuhw->counter[i]; - --cpuhw->n_counters; - ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); - if (counter->hw.idx) { - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; - } - perf_counter_update_userpage(counter); - break; - } - } - for (i = 0; i < cpuhw->n_limited; ++i) - if (counter == cpuhw->limited_counter[i]) - break; - if (i < cpuhw->n_limited) { - while (++i < cpuhw->n_limited) { - cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; - cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; - } - --cpuhw->n_limited; - } - if (cpuhw->n_counters == 0) { - /* disable exceptions if no counters are running */ - cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); - } - - perf_enable(); - local_irq_restore(flags); -} - -/* - * Re-enable interrupts on a counter after they were throttled - * because they were coming too fast. - */ -static void power_pmu_unthrottle(struct perf_counter *counter) -{ - s64 val, left; - unsigned long flags; - - if (!counter->hw.idx || !counter->hw.sample_period) - return; - local_irq_save(flags); - perf_disable(); - power_pmu_read(counter); - left = counter->hw.sample_period; - counter->hw.last_period = left; - val = 0; - if (left < 0x80000000L) - val = 0x80000000L - left; - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); - perf_enable(); - local_irq_restore(flags); -} - -struct pmu power_pmu = { - .enable = power_pmu_enable, - .disable = power_pmu_disable, - .read = power_pmu_read, - .unthrottle = power_pmu_unthrottle, -}; - -/* - * Return 1 if we might be able to put counter on a limited PMC, - * or 0 if not. - * A counter can only go on a limited PMC if it counts something - * that a limited PMC can count, doesn't require interrupts, and - * doesn't exclude any processor mode. - */ -static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, - unsigned int flags) -{ - int n; - u64 alt[MAX_EVENT_ALTERNATIVES]; - - if (counter->attr.exclude_user - || counter->attr.exclude_kernel - || counter->attr.exclude_hv - || counter->attr.sample_period) - return 0; - - if (ppmu->limited_pmc_event(ev)) - return 1; - - /* - * The requested event isn't on a limited PMC already; - * see if any alternative code goes on a limited PMC. - */ - if (!ppmu->get_alternatives) - return 0; - - flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; - n = ppmu->get_alternatives(ev, flags, alt); - - return n > 0; -} - -/* - * Find an alternative event that goes on a normal PMC, if possible, - * and return the event code, or 0 if there is no such alternative. - * (Note: event code 0 is "don't count" on all machines.) - */ -static u64 normal_pmc_alternative(u64 ev, unsigned long flags) -{ - u64 alt[MAX_EVENT_ALTERNATIVES]; - int n; - - flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); - n = ppmu->get_alternatives(ev, flags, alt); - if (!n) - return 0; - return alt[0]; -} - -/* Number of perf_counters counting hardware events */ -static atomic_t num_counters; -/* Used to avoid races in calling reserve/release_pmc_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - -/* - * Release the PMU if this is the last perf_counter. - */ -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - if (!atomic_add_unless(&num_counters, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_counters) == 0) - release_pmc_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Translate a generic cache event config to a raw event code. - */ -static int hw_perf_cache_event(u64 config, u64 *eventp) -{ - unsigned long type, op, result; - int ev; - - if (!ppmu->cache_events) - return -EINVAL; - - /* unpack config */ - type = config & 0xff; - op = (config >> 8) & 0xff; - result = (config >> 16) & 0xff; - - if (type >= PERF_COUNT_HW_CACHE_MAX || - op >= PERF_COUNT_HW_CACHE_OP_MAX || - result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - ev = (*ppmu->cache_events)[type][op][result]; - if (ev == 0) - return -EOPNOTSUPP; - if (ev == -1) - return -EINVAL; - *eventp = ev; - return 0; -} - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - u64 ev; - unsigned long flags; - struct perf_counter *ctrs[MAX_HWCOUNTERS]; - u64 events[MAX_HWCOUNTERS]; - unsigned int cflags[MAX_HWCOUNTERS]; - int n; - int err; - struct cpu_hw_counters *cpuhw; - - if (!ppmu) - return ERR_PTR(-ENXIO); - switch (counter->attr.type) { - case PERF_TYPE_HARDWARE: - ev = counter->attr.config; - if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); - ev = ppmu->generic_events[ev]; - break; - case PERF_TYPE_HW_CACHE: - err = hw_perf_cache_event(counter->attr.config, &ev); - if (err) - return ERR_PTR(err); - break; - case PERF_TYPE_RAW: - ev = counter->attr.config; - break; - default: - return ERR_PTR(-EINVAL); - } - counter->hw.config_base = ev; - counter->hw.idx = 0; - - /* - * If we are not running on a hypervisor, force the - * exclude_hv bit to 0 so that we don't care what - * the user set it to. - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->attr.exclude_hv = 0; - - /* - * If this is a per-task counter, then we can use - * PM_RUN_* events interchangeably with their non RUN_* - * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. - * XXX we should check if the task is an idle task. - */ - flags = 0; - if (counter->ctx->task) - flags |= PPMU_ONLY_COUNT_RUN; - - /* - * If this machine has limited counters, check whether this - * event could go on a limited counter. - */ - if (ppmu->flags & PPMU_LIMITED_PMC5_6) { - if (can_go_on_limited_pmc(counter, ev, flags)) { - flags |= PPMU_LIMITED_PMC_OK; - } else if (ppmu->limited_pmc_event(ev)) { - /* - * The requested event is on a limited PMC, - * but we can't use a limited PMC; see if any - * alternative goes on a normal PMC. - */ - ev = normal_pmc_alternative(ev, flags); - if (!ev) - return ERR_PTR(-EINVAL); - } - } - - /* - * If this is in a group, check if it can go on with all the - * other hardware counters in the group. We assume the counter - * hasn't been linked into its leader's sibling list at this point. - */ - n = 0; - if (counter->group_leader != counter) { - n = collect_events(counter->group_leader, ppmu->n_counter - 1, - ctrs, events, cflags); - if (n < 0) - return ERR_PTR(-EINVAL); - } - events[n] = ev; - ctrs[n] = counter; - cflags[n] = flags; - if (check_excludes(ctrs, cflags, n, 1)) - return ERR_PTR(-EINVAL); - - cpuhw = &get_cpu_var(cpu_hw_counters); - err = power_check_constraints(cpuhw, events, cflags, n + 1); - put_cpu_var(cpu_hw_counters); - if (err) - return ERR_PTR(-EINVAL); - - counter->hw.config = events[n]; - counter->hw.counter_base = cflags[n]; - counter->hw.last_period = counter->hw.sample_period; - atomic64_set(&counter->hw.period_left, counter->hw.last_period); - - /* - * See if we need to reserve the PMU. - * If no counters are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing - * reserve_pmc_hardware or release_pmc_hardware. - */ - err = 0; - if (!atomic_inc_not_zero(&num_counters)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && - reserve_pmc_hardware(perf_counter_interrupt)) - err = -EBUSY; - else - atomic_inc(&num_counters); - mutex_unlock(&pmc_reserve_mutex); - } - counter->destroy = hw_perf_counter_destroy; - - if (err) - return ERR_PTR(err); - return &power_pmu; -} - -/* - * A counter has overflowed; update its count and record - * things if requested. Note that interrupts are hard-disabled - * here so there is no possibility of being interrupted. - */ -static void record_and_restart(struct perf_counter *counter, unsigned long val, - struct pt_regs *regs, int nmi) -{ - u64 period = counter->hw.sample_period; - s64 prev, delta, left; - int record = 0; - - /* we don't have to worry about interrupts here */ - prev = atomic64_read(&counter->hw.prev_count); - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - - /* - * See if the total period for this counter has expired, - * and update for the next period. - */ - val = 0; - left = atomic64_read(&counter->hw.period_left) - delta; - if (period) { - if (left <= 0) { - left += period; - if (left <= 0) - left = period; - record = 1; - } - if (left < 0x80000000LL) - val = 0x80000000LL - left; - } - - /* - * Finally record data if requested. - */ - if (record) { - struct perf_sample_data data = { - .addr = 0, - .period = counter->hw.last_period, - }; - - if (counter->attr.sample_type & PERF_SAMPLE_ADDR) - perf_get_data_addr(regs, &data.addr); - - if (perf_counter_overflow(counter, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the counter to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each counter counts at most 2 counts - * per cycle). - */ - val = 0; - left = ~0ULL >> 1; - } - } - - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); -} - -/* - * Called from generic code to get the misc flags (i.e. processor mode) - * for an event. - */ -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - u32 flags = perf_get_misc_flags(regs); - - if (flags) - return flags; - return user_mode(regs) ? PERF_EVENT_MISC_USER : - PERF_EVENT_MISC_KERNEL; -} - -/* - * Called from generic code to get the instruction pointer - * for an event. - */ -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - unsigned long ip; - - if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ - - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; -} - -/* - * Performance monitor interrupt stuff - */ -static void perf_counter_interrupt(struct pt_regs *regs) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - unsigned long val; - int found = 0; - int nmi; - - if (cpuhw->n_limited) - freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), - mfspr(SPRN_PMC6)); - - perf_read_regs(regs); - - nmi = perf_intr_is_nmi(regs); - if (nmi) - nmi_enter(); - else - irq_enter(); - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (!counter->hw.idx || is_limited_pmc(counter->hw.idx)) - continue; - val = read_pmc(counter->hw.idx); - if ((int)val < 0) { - /* counter has overflowed */ - found = 1; - record_and_restart(counter, val, regs, nmi); - } - } - - /* - * In case we didn't find and reset the counter that caused - * the interrupt, scan all counters and reset any that are - * negative, to avoid getting continual interrupts. - * Any that we processed in the previous loop will not be negative. - */ - if (!found) { - for (i = 0; i < ppmu->n_counter; ++i) { - if (is_limited_pmc(i + 1)) - continue; - val = read_pmc(i + 1); - if ((int)val < 0) - write_pmc(i + 1, 0); - } - } - - /* - * Reset MMCR0 to its normal value. This will set PMXE and - * clear FC (freeze counters) and PMAO (perf mon alert occurred) - * and thus allow interrupts to occur again. - * XXX might want to use MSR.PM to keep the counters frozen until - * we get back out of this interrupt. - */ - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - if (nmi) - nmi_exit(); - else - irq_exit(); -} - -void hw_perf_counter_setup(int cpu) -{ - struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); - - if (!ppmu) - return; - memset(cpuhw, 0, sizeof(*cpuhw)); - cpuhw->mmcr[0] = MMCR0_FC; -} - -int register_power_pmu(struct power_pmu *pmu) -{ - if (ppmu) - return -EBUSY; /* something's already registered */ - - ppmu = pmu; - pr_info("%s performance monitor hardware support registered\n", - pmu->name); - -#ifdef MSR_HV - /* - * Use FCHV to ignore kernel events if MSR.HV is set. - */ - if (mfmsr() & MSR_HV) - freeze_counters_kernel = MMCR0_FCHV; -#endif /* CONFIG_PPC64 */ - - return 0; -} diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c new file mode 100644 index 000000000000..c98321fcb459 --- /dev/null +++ b/arch/powerpc/kernel/perf_event.c @@ -0,0 +1,1315 @@ +/* + * Performance event support - powerpc architecture code + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cpu_hw_events { + int n_events; + int n_percpu; + int disabled; + int n_added; + int n_limited; + u8 pmcs_enabled; + struct perf_event *event[MAX_HWEVENTS]; + u64 events[MAX_HWEVENTS]; + unsigned int flags[MAX_HWEVENTS]; + unsigned long mmcr[3]; + struct perf_event *limited_event[MAX_LIMITED_HWEVENTS]; + u8 limited_hwidx[MAX_LIMITED_HWEVENTS]; + u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; +}; +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +struct power_pmu *ppmu; + +/* + * Normally, to ignore kernel events we set the FCS (freeze events + * in supervisor mode) bit in MMCR0, but if the kernel runs with the + * hypervisor bit set in the MSR, or if we are running on a processor + * where the hypervisor bit is forced to 1 (as on Apple G5 processors), + * then we need to use the FCHV bit to ignore kernel events. + */ +static unsigned int freeze_events_kernel = MMCR0_FCS; + +/* + * 32-bit doesn't have MMCRA but does have an MMCR2, + * and a few other names are different. + */ +#ifdef CONFIG_PPC32 + +#define MMCR0_FCHV 0 +#define MMCR0_PMCjCE MMCR0_PMCnCE + +#define SPRN_MMCRA SPRN_MMCR2 +#define MMCRA_SAMPLE_ENABLE 0 + +static inline unsigned long perf_ip_adjust(struct pt_regs *regs) +{ + return 0; +} +static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + return 0; +} +static inline void perf_read_regs(struct pt_regs *regs) { } +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ + return 0; +} + +#endif /* CONFIG_PPC32 */ + +/* + * Things that are specific to 64-bit implementations. + */ +#ifdef CONFIG_PPC64 + +static inline unsigned long perf_ip_adjust(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + + if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; + if (slot > 1) + return 4 * (slot - 1); + } + return 0; +} + +/* + * The user wants a data address recorded. + * If we're not doing instruction sampling, give them the SDAR + * (sampled data address). If we are doing instruction sampling, then + * only give them the SDAR if it corresponds to the instruction + * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC + * bit in MMCRA. + */ +static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) +{ + unsigned long mmcra = regs->dsisr; + unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? + POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + + if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) + *addrp = mfspr(SPRN_SDAR); +} + +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + + if (TRAP(regs) != 0xf00) + return 0; /* not a PMU interrupt */ + + if (ppmu->flags & PPMU_ALT_SIPR) { + if (mmcra & POWER6_MMCRA_SIHV) + return PERF_RECORD_MISC_HYPERVISOR; + return (mmcra & POWER6_MMCRA_SIPR) ? + PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL; + } + if (mmcra & MMCRA_SIHV) + return PERF_RECORD_MISC_HYPERVISOR; + return (mmcra & MMCRA_SIPR) ? PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; +} + +/* + * Overload regs->dsisr to store MMCRA so we only need to read it once + * on each interrupt. + */ +static inline void perf_read_regs(struct pt_regs *regs) +{ + regs->dsisr = mfspr(SPRN_MMCRA); +} + +/* + * If interrupts were soft-disabled when a PMU interrupt occurs, treat + * it as an NMI. + */ +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ + return !regs->softe; +} + +#endif /* CONFIG_PPC64 */ + +static void perf_event_interrupt(struct pt_regs *regs); + +void perf_event_print_debug(void) +{ +} + +/* + * Read one performance monitor event (PMC). + */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 1: + val = mfspr(SPRN_PMC1); + break; + case 2: + val = mfspr(SPRN_PMC2); + break; + case 3: + val = mfspr(SPRN_PMC3); + break; + case 4: + val = mfspr(SPRN_PMC4); + break; + case 5: + val = mfspr(SPRN_PMC5); + break; + case 6: + val = mfspr(SPRN_PMC6); + break; +#ifdef CONFIG_PPC64 + case 7: + val = mfspr(SPRN_PMC7); + break; + case 8: + val = mfspr(SPRN_PMC8); + break; +#endif /* CONFIG_PPC64 */ + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 1: + mtspr(SPRN_PMC1, val); + break; + case 2: + mtspr(SPRN_PMC2, val); + break; + case 3: + mtspr(SPRN_PMC3, val); + break; + case 4: + mtspr(SPRN_PMC4, val); + break; + case 5: + mtspr(SPRN_PMC5, val); + break; + case 6: + mtspr(SPRN_PMC6, val); + break; +#ifdef CONFIG_PPC64 + case 7: + mtspr(SPRN_PMC7, val); + break; + case 8: + mtspr(SPRN_PMC8, val); + break; +#endif /* CONFIG_PPC64 */ + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } +} + +/* + * Check if a set of events can all go on the PMU at once. + * If they can't, this will look at alternative codes for the events + * and see if any combination of alternative codes is feasible. + * The feasible set is returned in event_id[]. + */ +static int power_check_constraints(struct cpu_hw_events *cpuhw, + u64 event_id[], unsigned int cflags[], + int n_ev) +{ + unsigned long mask, value, nv; + unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS]; + int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS]; + int i, j; + unsigned long addf = ppmu->add_fields; + unsigned long tadd = ppmu->test_adder; + + if (n_ev > ppmu->n_event) + return -1; + + /* First see if the events will go on as-is */ + for (i = 0; i < n_ev; ++i) { + if ((cflags[i] & PPMU_LIMITED_PMC_REQD) + && !ppmu->limited_pmc_event(event_id[i])) { + ppmu->get_alternatives(event_id[i], cflags[i], + cpuhw->alternatives[i]); + event_id[i] = cpuhw->alternatives[i][0]; + } + if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0], + &cpuhw->avalues[i][0])) + return -1; + } + value = mask = 0; + for (i = 0; i < n_ev; ++i) { + nv = (value | cpuhw->avalues[i][0]) + + (value & cpuhw->avalues[i][0] & addf); + if ((((nv + tadd) ^ value) & mask) != 0 || + (((nv + tadd) ^ cpuhw->avalues[i][0]) & + cpuhw->amasks[i][0]) != 0) + break; + value = nv; + mask |= cpuhw->amasks[i][0]; + } + if (i == n_ev) + return 0; /* all OK */ + + /* doesn't work, gather alternatives... */ + if (!ppmu->get_alternatives) + return -1; + for (i = 0; i < n_ev; ++i) { + choice[i] = 0; + n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i], + cpuhw->alternatives[i]); + for (j = 1; j < n_alt[i]; ++j) + ppmu->get_constraint(cpuhw->alternatives[i][j], + &cpuhw->amasks[i][j], + &cpuhw->avalues[i][j]); + } + + /* enumerate all possibilities and see if any will work */ + i = 0; + j = -1; + value = mask = nv = 0; + while (i < n_ev) { + if (j >= 0) { + /* we're backtracking, restore context */ + value = svalues[i]; + mask = smasks[i]; + j = choice[i]; + } + /* + * See if any alternative k for event_id i, + * where k > j, will satisfy the constraints. + */ + while (++j < n_alt[i]) { + nv = (value | cpuhw->avalues[i][j]) + + (value & cpuhw->avalues[i][j] & addf); + if ((((nv + tadd) ^ value) & mask) == 0 && + (((nv + tadd) ^ cpuhw->avalues[i][j]) + & cpuhw->amasks[i][j]) == 0) + break; + } + if (j >= n_alt[i]) { + /* + * No feasible alternative, backtrack + * to event_id i-1 and continue enumerating its + * alternatives from where we got up to. + */ + if (--i < 0) + return -1; + } else { + /* + * Found a feasible alternative for event_id i, + * remember where we got up to with this event_id, + * go on to the next event_id, and start with + * the first alternative for it. + */ + choice[i] = j; + svalues[i] = value; + smasks[i] = mask; + value = nv; + mask |= cpuhw->amasks[i][j]; + ++i; + j = -1; + } + } + + /* OK, we have a feasible combination, tell the caller the solution */ + for (i = 0; i < n_ev; ++i) + event_id[i] = cpuhw->alternatives[i][choice[i]]; + return 0; +} + +/* + * Check if newly-added events have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added events. + */ +static int check_excludes(struct perf_event **ctrs, unsigned int cflags[], + int n_prev, int n_new) +{ + int eu = 0, ek = 0, eh = 0; + int i, n, first; + struct perf_event *event; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + first = 1; + for (i = 0; i < n; ++i) { + if (cflags[i] & PPMU_LIMITED_PMC_OK) { + cflags[i] &= ~PPMU_LIMITED_PMC_REQD; + continue; + } + event = ctrs[i]; + if (first) { + eu = event->attr.exclude_user; + ek = event->attr.exclude_kernel; + eh = event->attr.exclude_hv; + first = 0; + } else if (event->attr.exclude_user != eu || + event->attr.exclude_kernel != ek || + event->attr.exclude_hv != eh) { + return -EAGAIN; + } + } + + if (eu || ek || eh) + for (i = 0; i < n; ++i) + if (cflags[i] & PPMU_LIMITED_PMC_OK) + cflags[i] |= PPMU_LIMITED_PMC_REQD; + + return 0; +} + +static void power_pmu_read(struct perf_event *event) +{ + s64 val, delta, prev; + + if (!event->hw.idx) + return; + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = atomic64_read(&event->hw.prev_count); + barrier(); + val = read_pmc(event->hw.idx); + } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + + /* The events are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + atomic64_sub(delta, &event->hw.period_left); +} + +/* + * On some machines, PMC5 and PMC6 can't be written, don't respect + * the freeze conditions, and don't generate interrupts. This tells + * us if `event' is using such a PMC. + */ +static int is_limited_pmc(int pmcnum) +{ + return (ppmu->flags & PPMU_LIMITED_PMC5_6) + && (pmcnum == 5 || pmcnum == 6); +} + +static void freeze_limited_events(struct cpu_hw_events *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_event *event; + u64 val, prev, delta; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + event = cpuhw->limited_event[i]; + if (!event->hw.idx) + continue; + val = (event->hw.idx == 5) ? pmc5 : pmc6; + prev = atomic64_read(&event->hw.prev_count); + event->hw.idx = 0; + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + } +} + +static void thaw_limited_events(struct cpu_hw_events *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_event *event; + u64 val; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + event = cpuhw->limited_event[i]; + event->hw.idx = cpuhw->limited_hwidx[i]; + val = (event->hw.idx == 5) ? pmc5 : pmc6; + atomic64_set(&event->hw.prev_count, val); + perf_event_update_userpage(event); + } +} + +/* + * Since limited events don't respect the freeze conditions, we + * have to read them immediately after freezing or unfreezing the + * other events. We try to keep the values from the limited + * events as consistent as possible by keeping the delay (in + * cycles and instructions) between freezing/unfreezing and reading + * the limited events as small and consistent as possible. + * Therefore, if any limited events are in use, we read them + * both, and always in the same order, to minimize variability, + * and do it inside the same asm that writes MMCR0. + */ +static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) +{ + unsigned long pmc5, pmc6; + + if (!cpuhw->n_limited) { + mtspr(SPRN_MMCR0, mmcr0); + return; + } + + /* + * Write MMCR0, then read PMC5 and PMC6 immediately. + * To ensure we don't get a performance monitor interrupt + * between writing MMCR0 and freezing/thawing the limited + * events, we first write MMCR0 with the event overflow + * interrupt enable bits turned off. + */ + asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" + : "=&r" (pmc5), "=&r" (pmc6) + : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), + "i" (SPRN_MMCR0), + "i" (SPRN_PMC5), "i" (SPRN_PMC6)); + + if (mmcr0 & MMCR0_FC) + freeze_limited_events(cpuhw, pmc5, pmc6); + else + thaw_limited_events(cpuhw, pmc5, pmc6); + + /* + * Write the full MMCR0 including the event overflow interrupt + * enable bits, if necessary. + */ + if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) + mtspr(SPRN_MMCR0, mmcr0); +} + +/* + * Disable all events to prevent PMU interrupts and to allow + * events to be added or removed. + */ +void hw_perf_disable(void) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + if (!ppmu) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + + if (!cpuhw->disabled) { + cpuhw->disabled = 1; + cpuhw->n_added = 0; + + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + ppc_enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + + /* + * Disable instruction sampling if it was enabled + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mtspr(SPRN_MMCRA, + cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mb(); + } + + /* + * Set the 'freeze events' bit. + * The barrier is to make sure the mtspr has been + * executed and the PMU has frozen the events + * before we return. + */ + write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); + mb(); + } + local_irq_restore(flags); +} + +/* + * Re-enable all events if disable == 0. + * If we were previously disabled and events were added, then + * put the new config on the PMU. + */ +void hw_perf_enable(void) +{ + struct perf_event *event; + struct cpu_hw_events *cpuhw; + unsigned long flags; + long i; + unsigned long val; + s64 left; + unsigned int hwc_index[MAX_HWEVENTS]; + int n_lim; + int idx; + + if (!ppmu) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw->disabled = 0; + + /* + * If we didn't change anything, or only removed events, + * no need to recalculate MMCR* settings and reset the PMCs. + * Just reenable the PMU with the current MMCR* settings + * (possibly updated for removal of events). + */ + if (!cpuhw->n_added) { + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + if (cpuhw->n_events == 0) + ppc_set_pmu_inuse(0); + goto out_enable; + } + + /* + * Compute MMCR* values for the new set of events + */ + if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index, + cpuhw->mmcr)) { + /* shouldn't ever get here */ + printk(KERN_ERR "oops compute_mmcr failed\n"); + goto out; + } + + /* + * Add in MMCR0 freeze bits corresponding to the + * attr.exclude_* bits for the first event. + * We have already checked that all events have the + * same values for these bits as the first event. + */ + event = cpuhw->event[0]; + if (event->attr.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (event->attr.exclude_kernel) + cpuhw->mmcr[0] |= freeze_events_kernel; + if (event->attr.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + + /* + * Write the new configuration to MMCR* with the freeze + * bit set and set the hardware events to their initial values. + * Then unfreeze the events. + */ + ppc_set_pmu_inuse(1); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) + | MMCR0_FC); + + /* + * Read off any pre-existing events that need to move + * to another PMC. + */ + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) { + power_pmu_read(event); + write_pmc(event->hw.idx, 0); + event->hw.idx = 0; + } + } + + /* + * Initialize the PMCs for all the new and moved events. + */ + cpuhw->n_limited = n_lim = 0; + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (event->hw.idx) + continue; + idx = hwc_index[i] + 1; + if (is_limited_pmc(idx)) { + cpuhw->limited_event[n_lim] = event; + cpuhw->limited_hwidx[n_lim] = idx; + ++n_lim; + continue; + } + val = 0; + if (event->hw.sample_period) { + left = atomic64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + atomic64_set(&event->hw.prev_count, val); + event->hw.idx = idx; + write_pmc(idx, val); + perf_event_update_userpage(event); + } + cpuhw->n_limited = n_lim; + cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + + out_enable: + mb(); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); + + /* + * Enable instruction sampling if necessary + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mb(); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + } + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_event *group, int max_count, + struct perf_event *ctrs[], u64 *events, + unsigned int *flags) +{ + int n = 0; + struct perf_event *event; + + if (!is_software_event(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + flags[n] = group->hw.event_base; + events[n++] = group->hw.config; + } + list_for_each_entry(event, &group->sibling_list, list_entry) { + if (!is_software_event(event) && + event->state != PERF_EVENT_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = event; + flags[n] = event->hw.event_base; + events[n++] = event->hw.config; + } + } + return n; +} + +static void event_sched_in(struct perf_event *event, int cpu) +{ + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; + event->tstamp_running += event->ctx->time - event->tstamp_stopped; + if (is_software_event(event)) + event->pmu->enable(event); +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + */ +int hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + struct cpu_hw_events *cpuhw; + long i, n, n0; + struct perf_event *sub; + + if (!ppmu) + return 0; + cpuhw = &__get_cpu_var(cpu_hw_events); + n0 = cpuhw->n_events; + n = collect_events(group_leader, ppmu->n_event - n0, + &cpuhw->event[n0], &cpuhw->events[n0], + &cpuhw->flags[n0]); + if (n < 0) + return -EAGAIN; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, n)) + return -EAGAIN; + i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); + if (i < 0) + return -EAGAIN; + cpuhw->n_events = n0 + n; + cpuhw->n_added += n; + + /* + * OK, this group can go on; update event states etc., + * and enable any software events + */ + for (i = n0; i < n0 + n; ++i) + cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; + n = 1; + event_sched_in(group_leader, cpu); + list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { + if (sub->state != PERF_EVENT_STATE_OFF) { + event_sched_in(sub, cpu); + ++n; + } + } + ctx->nr_active += n; + + return 1; +} + +/* + * Add a event to the PMU. + * If all events are not already frozen, then we disable and + * re-enable the PMU in order to get hw_perf_enable to do the + * actual work of reconfiguring the PMU. + */ +static int power_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + int n0; + int ret = -EAGAIN; + + local_irq_save(flags); + perf_disable(); + + /* + * Add the event to the list (if there is room) + * and check whether the total set is still feasible. + */ + cpuhw = &__get_cpu_var(cpu_hw_events); + n0 = cpuhw->n_events; + if (n0 >= ppmu->n_event) + goto out; + cpuhw->event[n0] = event; + cpuhw->events[n0] = event->hw.config; + cpuhw->flags[n0] = event->hw.event_base; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) + goto out; + if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) + goto out; + + event->hw.config = cpuhw->events[n0]; + ++cpuhw->n_events; + ++cpuhw->n_added; + + ret = 0; + out: + perf_enable(); + local_irq_restore(flags); + return ret; +} + +/* + * Remove a event from the PMU. + */ +static void power_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuhw; + long i; + unsigned long flags; + + local_irq_save(flags); + perf_disable(); + + power_pmu_read(event); + + cpuhw = &__get_cpu_var(cpu_hw_events); + for (i = 0; i < cpuhw->n_events; ++i) { + if (event == cpuhw->event[i]) { + while (++i < cpuhw->n_events) + cpuhw->event[i-1] = cpuhw->event[i]; + --cpuhw->n_events; + ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr); + if (event->hw.idx) { + write_pmc(event->hw.idx, 0); + event->hw.idx = 0; + } + perf_event_update_userpage(event); + break; + } + } + for (i = 0; i < cpuhw->n_limited; ++i) + if (event == cpuhw->limited_event[i]) + break; + if (i < cpuhw->n_limited) { + while (++i < cpuhw->n_limited) { + cpuhw->limited_event[i-1] = cpuhw->limited_event[i]; + cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; + } + --cpuhw->n_limited; + } + if (cpuhw->n_events == 0) { + /* disable exceptions if no events are running */ + cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); + } + + perf_enable(); + local_irq_restore(flags); +} + +/* + * Re-enable interrupts on a event after they were throttled + * because they were coming too fast. + */ +static void power_pmu_unthrottle(struct perf_event *event) +{ + s64 val, left; + unsigned long flags; + + if (!event->hw.idx || !event->hw.sample_period) + return; + local_irq_save(flags); + perf_disable(); + power_pmu_read(event); + left = event->hw.sample_period; + event->hw.last_period = left; + val = 0; + if (left < 0x80000000L) + val = 0x80000000L - left; + write_pmc(event->hw.idx, val); + atomic64_set(&event->hw.prev_count, val); + atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + perf_enable(); + local_irq_restore(flags); +} + +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, + .unthrottle = power_pmu_unthrottle, +}; + +/* + * Return 1 if we might be able to put event on a limited PMC, + * or 0 if not. + * A event can only go on a limited PMC if it counts something + * that a limited PMC can count, doesn't require interrupts, and + * doesn't exclude any processor mode. + */ +static int can_go_on_limited_pmc(struct perf_event *event, u64 ev, + unsigned int flags) +{ + int n; + u64 alt[MAX_EVENT_ALTERNATIVES]; + + if (event->attr.exclude_user + || event->attr.exclude_kernel + || event->attr.exclude_hv + || event->attr.sample_period) + return 0; + + if (ppmu->limited_pmc_event(ev)) + return 1; + + /* + * The requested event_id isn't on a limited PMC already; + * see if any alternative code goes on a limited PMC. + */ + if (!ppmu->get_alternatives) + return 0; + + flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; + n = ppmu->get_alternatives(ev, flags, alt); + + return n > 0; +} + +/* + * Find an alternative event_id that goes on a normal PMC, if possible, + * and return the event_id code, or 0 if there is no such alternative. + * (Note: event_id code 0 is "don't count" on all machines.) + */ +static u64 normal_pmc_alternative(u64 ev, unsigned long flags) +{ + u64 alt[MAX_EVENT_ALTERNATIVES]; + int n; + + flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); + n = ppmu->get_alternatives(ev, flags, alt); + if (!n) + return 0; + return alt[0]; +} + +/* Number of perf_events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Release the PMU if this is the last perf_event. + */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Translate a generic cache event_id config to a raw event_id code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ + unsigned long type, op, result; + int ev; + + if (!ppmu->cache_events) + return -EINVAL; + + /* unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*ppmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *eventp = ev; + return 0; +} + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + u64 ev; + unsigned long flags; + struct perf_event *ctrs[MAX_HWEVENTS]; + u64 events[MAX_HWEVENTS]; + unsigned int cflags[MAX_HWEVENTS]; + int n; + int err; + struct cpu_hw_events *cpuhw; + + if (!ppmu) + return ERR_PTR(-ENXIO); + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + ev = event->attr.config; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) + return ERR_PTR(-EOPNOTSUPP); + ev = ppmu->generic_events[ev]; + break; + case PERF_TYPE_HW_CACHE: + err = hw_perf_cache_event(event->attr.config, &ev); + if (err) + return ERR_PTR(err); + break; + case PERF_TYPE_RAW: + ev = event->attr.config; + break; + default: + return ERR_PTR(-EINVAL); + } + event->hw.config_base = ev; + event->hw.idx = 0; + + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + event->attr.exclude_hv = 0; + + /* + * If this is a per-task event, then we can use + * PM_RUN_* events interchangeably with their non RUN_* + * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. + * XXX we should check if the task is an idle task. + */ + flags = 0; + if (event->ctx->task) + flags |= PPMU_ONLY_COUNT_RUN; + + /* + * If this machine has limited events, check whether this + * event_id could go on a limited event. + */ + if (ppmu->flags & PPMU_LIMITED_PMC5_6) { + if (can_go_on_limited_pmc(event, ev, flags)) { + flags |= PPMU_LIMITED_PMC_OK; + } else if (ppmu->limited_pmc_event(ev)) { + /* + * The requested event_id is on a limited PMC, + * but we can't use a limited PMC; see if any + * alternative goes on a normal PMC. + */ + ev = normal_pmc_alternative(ev, flags); + if (!ev) + return ERR_PTR(-EINVAL); + } + } + + /* + * If this is in a group, check if it can go on with all the + * other hardware events in the group. We assume the event + * hasn't been linked into its leader's sibling list at this point. + */ + n = 0; + if (event->group_leader != event) { + n = collect_events(event->group_leader, ppmu->n_event - 1, + ctrs, events, cflags); + if (n < 0) + return ERR_PTR(-EINVAL); + } + events[n] = ev; + ctrs[n] = event; + cflags[n] = flags; + if (check_excludes(ctrs, cflags, n, 1)) + return ERR_PTR(-EINVAL); + + cpuhw = &get_cpu_var(cpu_hw_events); + err = power_check_constraints(cpuhw, events, cflags, n + 1); + put_cpu_var(cpu_hw_events); + if (err) + return ERR_PTR(-EINVAL); + + event->hw.config = events[n]; + event->hw.event_base = cflags[n]; + event->hw.last_period = event->hw.sample_period; + atomic64_set(&event->hw.period_left, event->hw.last_period); + + /* + * See if we need to reserve the PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. + */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && + reserve_pmc_hardware(perf_event_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + if (err) + return ERR_PTR(err); + return &power_pmu; +} + +/* + * A event has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_event *event, unsigned long val, + struct pt_regs *regs, int nmi) +{ + u64 period = event->hw.sample_period; + s64 prev, delta, left; + int record = 0; + + /* we don't have to worry about interrupts here */ + prev = atomic64_read(&event->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + + /* + * See if the total period for this event has expired, + * and update for the next period. + */ + val = 0; + left = atomic64_read(&event->hw.period_left) - delta; + if (period) { + if (left <= 0) { + left += period; + if (left <= 0) + left = period; + record = 1; + } + if (left < 0x80000000LL) + val = 0x80000000LL - left; + } + + /* + * Finally record data if requested. + */ + if (record) { + struct perf_sample_data data = { + .addr = 0, + .period = event->hw.last_period, + }; + + if (event->attr.sample_type & PERF_SAMPLE_ADDR) + perf_get_data_addr(regs, &data.addr); + + if (perf_event_overflow(event, nmi, &data, regs)) { + /* + * Interrupts are coming too fast - throttle them + * by setting the event to 0, so it will be + * at least 2^30 cycles until the next interrupt + * (assuming each event counts at most 2 counts + * per cycle). + */ + val = 0; + left = ~0ULL >> 1; + } + } + + write_pmc(event->hw.idx, val); + atomic64_set(&event->hw.prev_count, val); + atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); +} + +/* + * Called from generic code to get the misc flags (i.e. processor mode) + * for an event_id. + */ +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + u32 flags = perf_get_misc_flags(regs); + + if (flags) + return flags; + return user_mode(regs) ? PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; +} + +/* + * Called from generic code to get the instruction pointer + * for an event_id. + */ +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + unsigned long ip; + + if (TRAP(regs) != 0xf00) + return regs->nip; /* not a PMU interrupt */ + + ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + return ip; +} + +/* + * Performance monitor interrupt stuff + */ +static void perf_event_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + unsigned long val; + int found = 0; + int nmi; + + if (cpuhw->n_limited) + freeze_limited_events(cpuhw, mfspr(SPRN_PMC5), + mfspr(SPRN_PMC6)); + + perf_read_regs(regs); + + nmi = perf_intr_is_nmi(regs); + if (nmi) + nmi_enter(); + else + irq_enter(); + + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (!event->hw.idx || is_limited_pmc(event->hw.idx)) + continue; + val = read_pmc(event->hw.idx); + if ((int)val < 0) { + /* event has overflowed */ + found = 1; + record_and_restart(event, val, regs, nmi); + } + } + + /* + * In case we didn't find and reset the event that caused + * the interrupt, scan all events and reset any that are + * negative, to avoid getting continual interrupts. + * Any that we processed in the previous loop will not be negative. + */ + if (!found) { + for (i = 0; i < ppmu->n_event; ++i) { + if (is_limited_pmc(i + 1)) + continue; + val = read_pmc(i + 1); + if ((int)val < 0) + write_pmc(i + 1, 0); + } + } + + /* + * Reset MMCR0 to its normal value. This will set PMXE and + * clear FC (freeze events) and PMAO (perf mon alert occurred) + * and thus allow interrupts to occur again. + * XXX might want to use MSR.PM to keep the events frozen until + * we get back out of this interrupt. + */ + write_mmcr0(cpuhw, cpuhw->mmcr[0]); + + if (nmi) + nmi_exit(); + else + irq_exit(); +} + +void hw_perf_event_setup(int cpu) +{ + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (!ppmu) + return; + memset(cpuhw, 0, sizeof(*cpuhw)); + cpuhw->mmcr[0] = MMCR0_FC; +} + +int register_power_pmu(struct power_pmu *pmu) +{ + if (ppmu) + return -EBUSY; /* something's already registered */ + + ppmu = pmu; + pr_info("%s performance monitor hardware support registered\n", + pmu->name); + +#ifdef MSR_HV + /* + * Use FCHV to ignore kernel events if MSR.HV is set. + */ + if (mfmsr() & MSR_HV) + freeze_events_kernel = MMCR0_FCHV; +#endif /* CONFIG_PPC64 */ + + return 0; +} diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 3c90a3d9173e..2a361cdda635 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 31918af3e355..0f4c1c73a6ad 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 867f6f663963..c351b3a57fbb 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index fa21890531da..ca399ba5034c 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 018d094d92f9..28a4daacdc02 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 75dccb71a043..479574413a93 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 465e498bcb33..df45a7449a66 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -527,25 +527,25 @@ void __init iSeries_time_init_early(void) } #endif /* CONFIG_PPC_ISERIES */ -#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32) -DEFINE_PER_CPU(u8, perf_counter_pending); +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32) +DEFINE_PER_CPU(u8, perf_event_pending); -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { - get_cpu_var(perf_counter_pending) = 1; + get_cpu_var(perf_event_pending) = 1; set_dec(1); - put_cpu_var(perf_counter_pending); + put_cpu_var(perf_event_pending); } -#define test_perf_counter_pending() __get_cpu_var(perf_counter_pending) -#define clear_perf_counter_pending() __get_cpu_var(perf_counter_pending) = 0 +#define test_perf_event_pending() __get_cpu_var(perf_event_pending) +#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 -#else /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ +#else /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ -#define test_perf_counter_pending() 0 -#define clear_perf_counter_pending() +#define test_perf_event_pending() 0 +#define clear_perf_event_pending() -#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ +#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ /* * For iSeries shared processors, we have to let the hypervisor @@ -573,9 +573,9 @@ void timer_interrupt(struct pt_regs * regs) set_dec(DECREMENTER_MAX); #ifdef CONFIG_PPC32 - if (test_perf_counter_pending()) { - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (test_perf_event_pending()) { + clear_perf_event_pending(); + perf_event_do_pending(); } if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 830bef0a1131..e7dae82c1285 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +323,7 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9efc8bda01b4..e382cae678b8 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -280,9 +280,9 @@ config PPC_HAVE_PMU_SUPPORT config PPC_PERF_CTRS def_bool y - depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT + depends on PERF_EVENTS && PPC_HAVE_PMU_SUPPORT help - This enables the powerpc-specific perf_counter back-end. + This enables the powerpc-specific perf_event back-end. config SMP depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 1c866efd217d..43c0acad7160 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -94,7 +94,7 @@ config S390 select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config SCHED_OMIT_FRAME_POINTER bool diff --git a/arch/s390/include/asm/perf_counter.h b/arch/s390/include/asm/perf_counter.h deleted file mode 100644 index 7015188c2cc2..000000000000 --- a/arch/s390/include/asm/perf_counter.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Performance counter support - s390 specific definitions. - * - * Copyright 2009 Martin Schwidefsky, IBM Corporation. - */ - -static inline void set_perf_counter_pending(void) {} -static inline void clear_perf_counter_pending(void) {} - -#define PERF_COUNTER_INDEX_OFFSET 0 diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h new file mode 100644 index 000000000000..3840cbe77637 --- /dev/null +++ b/arch/s390/include/asm/perf_event.h @@ -0,0 +1,10 @@ +/* + * Performance event support - s390 specific definitions. + * + * Copyright 2009 Martin Schwidefsky, IBM Corporation. + */ + +static inline void set_perf_event_pending(void) {} +static inline void clear_perf_event_pending(void) {} + +#define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index c80602d7c880..cb5232df151e 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -268,7 +268,7 @@ #define __NR_preadv 328 #define __NR_pwritev 329 #define __NR_rt_tgsigqueueinfo 330 -#define __NR_perf_counter_open 331 +#define __NR_perf_event_open 331 #define NR_syscalls 332 /* diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 88a83366819f..624790042d41 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -1832,11 +1832,11 @@ compat_sys_rt_tgsigqueueinfo_wrapper: llgtr %r5,%r5 # struct compat_siginfo * jg compat_sys_rt_tgsigqueueinfo_wrapper # branch to system call - .globl sys_perf_counter_open_wrapper -sys_perf_counter_open_wrapper: - llgtr %r2,%r2 # const struct perf_counter_attr * + .globl sys_perf_event_open_wrapper +sys_perf_event_open_wrapper: + llgtr %r2,%r2 # const struct perf_event_attr * lgfr %r3,%r3 # pid_t lgfr %r4,%r4 # int lgfr %r5,%r5 # int llgfr %r6,%r6 # unsigned long - jg sys_perf_counter_open # branch to system call + jg sys_perf_event_open # branch to system call diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index ad1acd200385..0b5083681e77 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -339,4 +339,4 @@ SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper) SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper) SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper) SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */ -SYSCALL(sys_perf_counter_open,sys_perf_counter_open,sys_perf_counter_open_wrapper) +SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 1abbadd497e1..6d507462967a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -10,7 +10,7 @@ * Copyright (C) 1995 Linus Torvalds */ -#include +#include #include #include #include @@ -306,7 +306,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) * interrupts again and then search the VMAs */ local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); si_code = SEGV_MAPERR; @@ -366,11 +366,11 @@ good_area: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 4df3570fe511..b940424f8ccc 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,7 +16,7 @@ config SUPERH select HAVE_IOREMAP_PROT if MMU select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/include/asm/perf_counter.h b/arch/sh/include/asm/perf_counter.h deleted file mode 100644 index d8e6bb9c0ccc..000000000000 --- a/arch/sh/include/asm/perf_counter.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __ASM_SH_PERF_COUNTER_H -#define __ASM_SH_PERF_COUNTER_H - -/* SH only supports software counters through this interface. */ -static inline void set_perf_counter_pending(void) {} - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#endif /* __ASM_SH_PERF_COUNTER_H */ diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h new file mode 100644 index 000000000000..11a302297ab7 --- /dev/null +++ b/arch/sh/include/asm/perf_event.h @@ -0,0 +1,9 @@ +#ifndef __ASM_SH_PERF_EVENT_H +#define __ASM_SH_PERF_EVENT_H + +/* SH only supports software events through this interface. */ +static inline void set_perf_event_pending(void) {} + +#define PERF_EVENT_INDEX_OFFSET 0 + +#endif /* __ASM_SH_PERF_EVENT_H */ diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h index 925dd40d9d55..f3fd1b9eb6b1 100644 --- a/arch/sh/include/asm/unistd_32.h +++ b/arch/sh/include/asm/unistd_32.h @@ -344,7 +344,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #define NR_syscalls 337 diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h index 2b84bc916bc5..343ce8f073ea 100644 --- a/arch/sh/include/asm/unistd_64.h +++ b/arch/sh/include/asm/unistd_64.h @@ -384,7 +384,7 @@ #define __NR_preadv 361 #define __NR_pwritev 362 #define __NR_rt_tgsigqueueinfo 363 -#define __NR_perf_counter_open 364 +#define __NR_perf_event_open 364 #ifdef __KERNEL__ diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 16ba225ede89..19fd11dd9871 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -352,4 +352,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index af6fb7410c21..5bfde6c77498 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -390,4 +390,4 @@ sys_call_table: .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index 781b413ff82d..47530104e0ad 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -157,7 +157,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if ((regs->sr & SR_IMASK) != SR_IMASK) local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -208,11 +208,11 @@ survive: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 2dcc48528f7a..de0b0e881823 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, /* Not an IO address, so reenable interrupts */ local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt or have no user @@ -201,11 +201,11 @@ survive: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 86b82348b97c..97fca4695e0b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -25,7 +25,7 @@ config SPARC select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG @@ -47,7 +47,7 @@ config SPARC64 select RTC_DRV_BQ4802 select RTC_DRV_SUN4V select RTC_DRV_STARFIRE - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h deleted file mode 100644 index 5d7a8ca0e491..000000000000 --- a/arch/sparc/include/asm/perf_counter.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __ASM_SPARC_PERF_COUNTER_H -#define __ASM_SPARC_PERF_COUNTER_H - -extern void set_perf_counter_pending(void); - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -#else -static inline void init_hw_perf_counters(void) { } -#endif - -#endif diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h new file mode 100644 index 000000000000..7e2669894ce8 --- /dev/null +++ b/arch/sparc/include/asm/perf_event.h @@ -0,0 +1,14 @@ +#ifndef __ASM_SPARC_PERF_EVENT_H +#define __ASM_SPARC_PERF_EVENT_H + +extern void set_perf_event_pending(void); + +#define PERF_EVENT_INDEX_OFFSET 0 + +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +#else +static inline void init_hw_perf_events(void) { } +#endif + +#endif diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 706df669f3b8..42f2316c3eaa 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -395,7 +395,7 @@ #define __NR_preadv 324 #define __NR_pwritev 325 #define __NR_rt_tgsigqueueinfo 326 -#define __NR_perf_counter_open 327 +#define __NR_perf_event_open 327 #define NR_SYSCALLS 328 diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 247cc620cee5..3a048fad7ee2 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -104,5 +104,5 @@ obj-$(CONFIG_AUDIT) += audit.o audit--$(CONFIG_AUDIT) := compat_audit.o obj-$(CONFIG_COMPAT) += $(audit--y) -pc--$(CONFIG_PERF_COUNTERS) := perf_counter.o +pc--$(CONFIG_PERF_EVENTS) := perf_event.o obj-$(CONFIG_SPARC64) += $(pc--y) diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 378eb53e0776..b129611590a4 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -265,7 +265,7 @@ int __init nmi_init(void) } } if (!err) - init_hw_perf_counters(); + init_hw_perf_events(); return err; } diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 68ff00107073..2d94e7a03af5 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -15,7 +15,7 @@ /* This code is shared between various users of the performance * counters. Users will be oprofile, pseudo-NMI watchdog, and the - * perf_counter support layer. + * perf_event support layer. */ #define PCR_SUN4U_ENABLE (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE) @@ -42,14 +42,14 @@ void deferred_pcr_work_irq(int irq, struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_PERF_COUNTERS - perf_counter_do_pending(); +#ifdef CONFIG_PERF_EVENTS + perf_event_do_pending(); #endif irq_exit(); set_irq_regs(old_regs); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { set_softint(1 << PIL_DEFERRED_PCR_WORK); } diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c deleted file mode 100644 index b1265ce8a053..000000000000 --- a/arch/sparc/kernel/perf_counter.c +++ /dev/null @@ -1,556 +0,0 @@ -/* Performance counter support for sparc64. - * - * Copyright (C) 2009 David S. Miller - * - * This code is based almost entirely upon the x86 perf counter - * code, which is: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Sparc64 chips have two performance counters, 32-bits each, with - * overflow interrupts generated on transition from 0xffffffff to 0. - * The counters are accessed in one go using a 64-bit register. - * - * Both counters are controlled using a single control register. The - * only way to stop all sampling is to clear all of the context (user, - * supervisor, hypervisor) sampling enable bits. But these bits apply - * to both counters, thus the two counters can't be enabled/disabled - * individually. - * - * The control register has two event fields, one for each of the two - * counters. It's thus nearly impossible to have one counter going - * while keeping the other one stopped. Therefore it is possible to - * get overflow interrupts for counters not currently "in use" and - * that condition must be checked in the overflow interrupt handler. - * - * So we use a hack, in that we program inactive counters with the - * "sw_count0" and "sw_count1" events. These count how many times - * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an - * unusual way to encode a NOP and therefore will not trigger in - * normal code. - */ - -#define MAX_HWCOUNTERS 2 -#define MAX_PERIOD ((1UL << 32) - 1) - -#define PIC_UPPER_INDEX 0 -#define PIC_LOWER_INDEX 1 - -struct cpu_hw_counters { - struct perf_counter *counters[MAX_HWCOUNTERS]; - unsigned long used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)]; - unsigned long active_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)]; - int enabled; -}; -DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; - -struct perf_event_map { - u16 encoding; - u8 pic_mask; -#define PIC_NONE 0x00 -#define PIC_UPPER 0x01 -#define PIC_LOWER 0x02 -}; - -struct sparc_pmu { - const struct perf_event_map *(*event_map)(int); - int max_events; - int upper_shift; - int lower_shift; - int event_mask; - int hv_bit; - int irq_bit; - int upper_nop; - int lower_nop; -}; - -static const struct perf_event_map ultra3i_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER }, - [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER }, -}; - -static const struct perf_event_map *ultra3i_event_map(int event) -{ - return &ultra3i_perfmon_event_map[event]; -} - -static const struct sparc_pmu ultra3i_pmu = { - .event_map = ultra3i_event_map, - .max_events = ARRAY_SIZE(ultra3i_perfmon_event_map), - .upper_shift = 11, - .lower_shift = 4, - .event_mask = 0x3f, - .upper_nop = 0x1c, - .lower_nop = 0x14, -}; - -static const struct perf_event_map niagara2_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER }, -}; - -static const struct perf_event_map *niagara2_event_map(int event) -{ - return &niagara2_perfmon_event_map[event]; -} - -static const struct sparc_pmu niagara2_pmu = { - .event_map = niagara2_event_map, - .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), - .upper_shift = 19, - .lower_shift = 6, - .event_mask = 0xfff, - .hv_bit = 0x8, - .irq_bit = 0x03, - .upper_nop = 0x220, - .lower_nop = 0x220, -}; - -static const struct sparc_pmu *sparc_pmu __read_mostly; - -static u64 event_encoding(u64 event, int idx) -{ - if (idx == PIC_UPPER_INDEX) - event <<= sparc_pmu->upper_shift; - else - event <<= sparc_pmu->lower_shift; - return event; -} - -static u64 mask_for_index(int idx) -{ - return event_encoding(sparc_pmu->event_mask, idx); -} - -static u64 nop_for_index(int idx) -{ - return event_encoding(idx == PIC_UPPER_INDEX ? - sparc_pmu->upper_nop : - sparc_pmu->lower_nop, idx); -} - -static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc, - int idx) -{ - u64 val, mask = mask_for_index(idx); - - val = pcr_ops->read(); - pcr_ops->write((val & ~mask) | hwc->config); -} - -static inline void sparc_pmu_disable_counter(struct hw_perf_counter *hwc, - int idx) -{ - u64 mask = mask_for_index(idx); - u64 nop = nop_for_index(idx); - u64 val = pcr_ops->read(); - - pcr_ops->write((val & ~mask) | nop); -} - -void hw_perf_enable(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - int i; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - val = pcr_ops->read(); - - for (i = 0; i < MAX_HWCOUNTERS; i++) { - struct perf_counter *cp = cpuc->counters[i]; - struct hw_perf_counter *hwc; - - if (!cp) - continue; - hwc = &cp->hw; - val |= hwc->config_base; - } - - pcr_ops->write(val); -} - -void hw_perf_disable(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - - val = pcr_ops->read(); - val &= ~(PCR_UTRACE | PCR_STRACE | - sparc_pmu->hv_bit | sparc_pmu->irq_bit); - pcr_ops->write(val); -} - -static u32 read_pmc(int idx) -{ - u64 val; - - read_pic(val); - if (idx == PIC_UPPER_INDEX) - val >>= 32; - - return val & 0xffffffff; -} - -static void write_pmc(int idx, u64 val) -{ - u64 shift, mask, pic; - - shift = 0; - if (idx == PIC_UPPER_INDEX) - shift = 32; - - mask = ((u64) 0xffffffff) << shift; - val <<= shift; - - read_pic(pic); - pic &= ~mask; - pic |= val; - write_pic(pic); -} - -static int sparc_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int ret = 0; - - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - if (left > MAX_PERIOD) - left = MAX_PERIOD; - - atomic64_set(&hwc->prev_count, (u64)-left); - - write_pmc(idx, (u64)(-left) & 0xffffffff); - - perf_counter_update_userpage(counter); - - return ret; -} - -static int sparc_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - sparc_pmu_disable_counter(hwc, idx); - - cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active_mask); - - sparc_perf_counter_set_period(counter, hwc, idx); - sparc_pmu_enable_counter(hwc, idx); - perf_counter_update_userpage(counter); - return 0; -} - -static u64 sparc_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - int shift = 64 - 32; - u64 prev_raw_count, new_raw_count; - s64 delta; - -again: - prev_raw_count = atomic64_read(&hwc->prev_count); - new_raw_count = read_pmc(idx); - - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -static void sparc_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - clear_bit(idx, cpuc->active_mask); - sparc_pmu_disable_counter(hwc, idx); - - barrier(); - - sparc_perf_counter_update(counter, hwc, idx); - cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_counter_update_userpage(counter); -} - -static void sparc_pmu_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - sparc_perf_counter_update(counter, hwc, hwc->idx); -} - -static void sparc_pmu_unthrottle(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - sparc_pmu_enable_counter(hwc, hwc->idx); -} - -static atomic_t active_counters = ATOMIC_INIT(0); -static DEFINE_MUTEX(pmc_grab_mutex); - -void perf_counter_grab_pmc(void) -{ - if (atomic_inc_not_zero(&active_counters)) - return; - - mutex_lock(&pmc_grab_mutex); - if (atomic_read(&active_counters) == 0) { - if (atomic_read(&nmi_active) > 0) { - on_each_cpu(stop_nmi_watchdog, NULL, 1); - BUG_ON(atomic_read(&nmi_active) != 0); - } - atomic_inc(&active_counters); - } - mutex_unlock(&pmc_grab_mutex); -} - -void perf_counter_release_pmc(void) -{ - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_grab_mutex)) { - if (atomic_read(&nmi_active) == 0) - on_each_cpu(start_nmi_watchdog, NULL, 1); - mutex_unlock(&pmc_grab_mutex); - } -} - -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - perf_counter_release_pmc(); -} - -static int __hw_perf_counter_init(struct perf_counter *counter) -{ - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; - const struct perf_event_map *pmap; - u64 enc; - - if (atomic_read(&nmi_active) < 0) - return -ENODEV; - - if (attr->type != PERF_TYPE_HARDWARE) - return -EOPNOTSUPP; - - if (attr->config >= sparc_pmu->max_events) - return -EINVAL; - - perf_counter_grab_pmc(); - counter->destroy = hw_perf_counter_destroy; - - /* We save the enable bits in the config_base. So to - * turn off sampling just write 'config', and to enable - * things write 'config | config_base'. - */ - hwc->config_base = sparc_pmu->irq_bit; - if (!attr->exclude_user) - hwc->config_base |= PCR_UTRACE; - if (!attr->exclude_kernel) - hwc->config_base |= PCR_STRACE; - if (!attr->exclude_hv) - hwc->config_base |= sparc_pmu->hv_bit; - - if (!hwc->sample_period) { - hwc->sample_period = MAX_PERIOD; - hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); - } - - pmap = sparc_pmu->event_map(attr->config); - - enc = pmap->encoding; - if (pmap->pic_mask & PIC_UPPER) { - hwc->idx = PIC_UPPER_INDEX; - enc <<= sparc_pmu->upper_shift; - } else { - hwc->idx = PIC_LOWER_INDEX; - enc <<= sparc_pmu->lower_shift; - } - - hwc->config |= enc; - return 0; -} - -static const struct pmu pmu = { - .enable = sparc_pmu_enable, - .disable = sparc_pmu_disable, - .read = sparc_pmu_read, - .unthrottle = sparc_pmu_unthrottle, -}; - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - int err = __hw_perf_counter_init(counter); - - if (err) - return ERR_PTR(err); - return &pmu; -} - -void perf_counter_print_debug(void) -{ - unsigned long flags; - u64 pcr, pic; - int cpu; - - if (!sparc_pmu) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - - pcr = pcr_ops->read(); - read_pic(pic); - - pr_info("\n"); - pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", - cpu, pcr, pic); - - local_irq_restore(flags); -} - -static int __kprobes perf_counter_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) -{ - struct die_args *args = __args; - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct pt_regs *regs; - int idx; - - if (!atomic_read(&active_counters)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - for (idx = 0; idx < MAX_HWCOUNTERS; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; - struct hw_perf_counter *hwc; - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - hwc = &counter->hw; - val = sparc_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << 31)) - continue; - - data.period = counter->hw.last_period; - if (!sparc_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - sparc_pmu_disable_counter(hwc, idx); - } - - return NOTIFY_STOP; -} - -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, -}; - -static bool __init supported_pmu(void) -{ - if (!strcmp(sparc_pmu_type, "ultra3i")) { - sparc_pmu = &ultra3i_pmu; - return true; - } - if (!strcmp(sparc_pmu_type, "niagara2")) { - sparc_pmu = &niagara2_pmu; - return true; - } - return false; -} - -void __init init_hw_perf_counters(void) -{ - pr_info("Performance counters: "); - - if (!supported_pmu()) { - pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); - return; - } - - pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); - - /* All sparc64 PMUs currently have 2 counters. But this simple - * driver only supports one active counter at a time. - */ - perf_max_counters = 1; - - register_die_notifier(&perf_counter_nmi_notifier); -} diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c new file mode 100644 index 000000000000..2d6a1b10c81d --- /dev/null +++ b/arch/sparc/kernel/perf_event.c @@ -0,0 +1,556 @@ +/* Performance event support for sparc64. + * + * Copyright (C) 2009 David S. Miller + * + * This code is based almost entirely upon the x86 perf event + * code, which is: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Sparc64 chips have two performance counters, 32-bits each, with + * overflow interrupts generated on transition from 0xffffffff to 0. + * The counters are accessed in one go using a 64-bit register. + * + * Both counters are controlled using a single control register. The + * only way to stop all sampling is to clear all of the context (user, + * supervisor, hypervisor) sampling enable bits. But these bits apply + * to both counters, thus the two counters can't be enabled/disabled + * individually. + * + * The control register has two event fields, one for each of the two + * counters. It's thus nearly impossible to have one counter going + * while keeping the other one stopped. Therefore it is possible to + * get overflow interrupts for counters not currently "in use" and + * that condition must be checked in the overflow interrupt handler. + * + * So we use a hack, in that we program inactive counters with the + * "sw_count0" and "sw_count1" events. These count how many times + * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an + * unusual way to encode a NOP and therefore will not trigger in + * normal code. + */ + +#define MAX_HWEVENTS 2 +#define MAX_PERIOD ((1UL << 32) - 1) + +#define PIC_UPPER_INDEX 0 +#define PIC_LOWER_INDEX 1 + +struct cpu_hw_events { + struct perf_event *events[MAX_HWEVENTS]; + unsigned long used_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; + unsigned long active_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; + int enabled; +}; +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; + +struct perf_event_map { + u16 encoding; + u8 pic_mask; +#define PIC_NONE 0x00 +#define PIC_UPPER 0x01 +#define PIC_LOWER 0x02 +}; + +struct sparc_pmu { + const struct perf_event_map *(*event_map)(int); + int max_events; + int upper_shift; + int lower_shift; + int event_mask; + int hv_bit; + int irq_bit; + int upper_nop; + int lower_nop; +}; + +static const struct perf_event_map ultra3i_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER }, +}; + +static const struct perf_event_map *ultra3i_event_map(int event_id) +{ + return &ultra3i_perfmon_event_map[event_id]; +} + +static const struct sparc_pmu ultra3i_pmu = { + .event_map = ultra3i_event_map, + .max_events = ARRAY_SIZE(ultra3i_perfmon_event_map), + .upper_shift = 11, + .lower_shift = 4, + .event_mask = 0x3f, + .upper_nop = 0x1c, + .lower_nop = 0x14, +}; + +static const struct perf_event_map niagara2_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER }, +}; + +static const struct perf_event_map *niagara2_event_map(int event_id) +{ + return &niagara2_perfmon_event_map[event_id]; +} + +static const struct sparc_pmu niagara2_pmu = { + .event_map = niagara2_event_map, + .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), + .upper_shift = 19, + .lower_shift = 6, + .event_mask = 0xfff, + .hv_bit = 0x8, + .irq_bit = 0x03, + .upper_nop = 0x220, + .lower_nop = 0x220, +}; + +static const struct sparc_pmu *sparc_pmu __read_mostly; + +static u64 event_encoding(u64 event_id, int idx) +{ + if (idx == PIC_UPPER_INDEX) + event_id <<= sparc_pmu->upper_shift; + else + event_id <<= sparc_pmu->lower_shift; + return event_id; +} + +static u64 mask_for_index(int idx) +{ + return event_encoding(sparc_pmu->event_mask, idx); +} + +static u64 nop_for_index(int idx) +{ + return event_encoding(idx == PIC_UPPER_INDEX ? + sparc_pmu->upper_nop : + sparc_pmu->lower_nop, idx); +} + +static inline void sparc_pmu_enable_event(struct hw_perf_event *hwc, + int idx) +{ + u64 val, mask = mask_for_index(idx); + + val = pcr_ops->read(); + pcr_ops->write((val & ~mask) | hwc->config); +} + +static inline void sparc_pmu_disable_event(struct hw_perf_event *hwc, + int idx) +{ + u64 mask = mask_for_index(idx); + u64 nop = nop_for_index(idx); + u64 val = pcr_ops->read(); + + pcr_ops->write((val & ~mask) | nop); +} + +void hw_perf_enable(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + int i; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + val = pcr_ops->read(); + + for (i = 0; i < MAX_HWEVENTS; i++) { + struct perf_event *cp = cpuc->events[i]; + struct hw_perf_event *hwc; + + if (!cp) + continue; + hwc = &cp->hw; + val |= hwc->config_base; + } + + pcr_ops->write(val); +} + +void hw_perf_disable(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + + val = pcr_ops->read(); + val &= ~(PCR_UTRACE | PCR_STRACE | + sparc_pmu->hv_bit | sparc_pmu->irq_bit); + pcr_ops->write(val); +} + +static u32 read_pmc(int idx) +{ + u64 val; + + read_pic(val); + if (idx == PIC_UPPER_INDEX) + val >>= 32; + + return val & 0xffffffff; +} + +static void write_pmc(int idx, u64 val) +{ + u64 shift, mask, pic; + + shift = 0; + if (idx == PIC_UPPER_INDEX) + shift = 32; + + mask = ((u64) 0xffffffff) << shift; + val <<= shift; + + read_pic(pic); + pic &= ~mask; + pic |= val; + write_pic(pic); +} + +static int sparc_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + if (left > MAX_PERIOD) + left = MAX_PERIOD; + + atomic64_set(&hwc->prev_count, (u64)-left); + + write_pmc(idx, (u64)(-left) & 0xffffffff); + + perf_event_update_userpage(event); + + return ret; +} + +static int sparc_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + sparc_pmu_disable_event(hwc, idx); + + cpuc->events[idx] = event; + set_bit(idx, cpuc->active_mask); + + sparc_perf_event_set_period(event, hwc, idx); + sparc_pmu_enable_event(hwc, idx); + perf_event_update_userpage(event); + return 0; +} + +static u64 sparc_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + int shift = 64 - 32; + u64 prev_raw_count, new_raw_count; + s64 delta; + +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + new_raw_count = read_pmc(idx); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &event->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static void sparc_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + clear_bit(idx, cpuc->active_mask); + sparc_pmu_disable_event(hwc, idx); + + barrier(); + + sparc_perf_event_update(event, hwc, idx); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +static void sparc_pmu_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + sparc_perf_event_update(event, hwc, hwc->idx); +} + +static void sparc_pmu_unthrottle(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + sparc_pmu_enable_event(hwc, hwc->idx); +} + +static atomic_t active_events = ATOMIC_INIT(0); +static DEFINE_MUTEX(pmc_grab_mutex); + +void perf_event_grab_pmc(void) +{ + if (atomic_inc_not_zero(&active_events)) + return; + + mutex_lock(&pmc_grab_mutex); + if (atomic_read(&active_events) == 0) { + if (atomic_read(&nmi_active) > 0) { + on_each_cpu(stop_nmi_watchdog, NULL, 1); + BUG_ON(atomic_read(&nmi_active) != 0); + } + atomic_inc(&active_events); + } + mutex_unlock(&pmc_grab_mutex); +} + +void perf_event_release_pmc(void) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_grab_mutex)) { + if (atomic_read(&nmi_active) == 0) + on_each_cpu(start_nmi_watchdog, NULL, 1); + mutex_unlock(&pmc_grab_mutex); + } +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + perf_event_release_pmc(); +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + const struct perf_event_map *pmap; + u64 enc; + + if (atomic_read(&nmi_active) < 0) + return -ENODEV; + + if (attr->type != PERF_TYPE_HARDWARE) + return -EOPNOTSUPP; + + if (attr->config >= sparc_pmu->max_events) + return -EINVAL; + + perf_event_grab_pmc(); + event->destroy = hw_perf_event_destroy; + + /* We save the enable bits in the config_base. So to + * turn off sampling just write 'config', and to enable + * things write 'config | config_base'. + */ + hwc->config_base = sparc_pmu->irq_bit; + if (!attr->exclude_user) + hwc->config_base |= PCR_UTRACE; + if (!attr->exclude_kernel) + hwc->config_base |= PCR_STRACE; + if (!attr->exclude_hv) + hwc->config_base |= sparc_pmu->hv_bit; + + if (!hwc->sample_period) { + hwc->sample_period = MAX_PERIOD; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } + + pmap = sparc_pmu->event_map(attr->config); + + enc = pmap->encoding; + if (pmap->pic_mask & PIC_UPPER) { + hwc->idx = PIC_UPPER_INDEX; + enc <<= sparc_pmu->upper_shift; + } else { + hwc->idx = PIC_LOWER_INDEX; + enc <<= sparc_pmu->lower_shift; + } + + hwc->config |= enc; + return 0; +} + +static const struct pmu pmu = { + .enable = sparc_pmu_enable, + .disable = sparc_pmu_disable, + .read = sparc_pmu_read, + .unthrottle = sparc_pmu_unthrottle, +}; + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + int err = __hw_perf_event_init(event); + + if (err) + return ERR_PTR(err); + return &pmu; +} + +void perf_event_print_debug(void) +{ + unsigned long flags; + u64 pcr, pic; + int cpu; + + if (!sparc_pmu) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + + pcr = pcr_ops->read(); + read_pic(pic); + + pr_info("\n"); + pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", + cpu, pcr, pic); + + local_irq_restore(flags); +} + +static int __kprobes perf_event_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct pt_regs *regs; + int idx; + + if (!atomic_read(&active_events)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + for (idx = 0; idx < MAX_HWEVENTS; idx++) { + struct perf_event *event = cpuc->events[idx]; + struct hw_perf_event *hwc; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + hwc = &event->hw; + val = sparc_perf_event_update(event, hwc, idx); + if (val & (1ULL << 31)) + continue; + + data.period = event->hw.last_period; + if (!sparc_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + sparc_pmu_disable_event(hwc, idx); + } + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, +}; + +static bool __init supported_pmu(void) +{ + if (!strcmp(sparc_pmu_type, "ultra3i")) { + sparc_pmu = &ultra3i_pmu; + return true; + } + if (!strcmp(sparc_pmu_type, "niagara2")) { + sparc_pmu = &niagara2_pmu; + return true; + } + return false; +} + +void __init init_hw_perf_events(void) +{ + pr_info("Performance events: "); + + if (!supported_pmu()) { + pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); + return; + } + + pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); + + /* All sparc64 PMUs currently have 2 events. But this simple + * driver only supports one active event at a time. + */ + perf_max_events = 1; + + register_die_notifier(&perf_event_nmi_notifier); +} diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 04181577cb65..0f1658d37490 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -82,5 +82,5 @@ sys_call_table: /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv -/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open +/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 91b06b7f7acf..009825f6e73c 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -83,7 +83,7 @@ sys_call_table32: /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv - .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_counter_open + .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open #endif /* CONFIG_COMPAT */ @@ -158,4 +158,4 @@ sys_call_table: /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv - .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open + .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51c59015b280..e4ff5d1280ca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_COUNTERS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ba331bfd1112..74619c4f9fda 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -831,5 +831,5 @@ ia32_sys_call_table: .quad compat_sys_preadv .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_counter_open + .quad sys_perf_event_open ia32_syscall_end: diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 5e3f2044f0d3..f5693c81a1db 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h deleted file mode 100644 index e7b7c938ae27..000000000000 --- a/arch/x86/include/asm/perf_counter.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef _ASM_X86_PERF_COUNTER_H -#define _ASM_X86_PERF_COUNTER_H - -/* - * Performance counter hw details: - */ - -#define X86_PMC_MAX_GENERIC 8 -#define X86_PMC_MAX_FIXED 3 - -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_MAX 64 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -/* - * Includes eventsel and unit mask as well: - */ -#define ARCH_PERFMON_EVENT_MASK 0xffff - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 - -/* - * Intel "Architectural Performance Monitoring" CPUID - * detection/enumeration details: - */ -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -union cpuid10_edx { - struct { - unsigned int num_counters_fixed:4; - unsigned int reserved:28; - } split; - unsigned int full; -}; - - -/* - * Fixed-purpose performance counters: - */ - -/* - * All 3 fixed-mode PMCs are configured via this single MSR: - */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d - -/* - * The counts are available in three separate MSRs: - */ - -/* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) - -/* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) - -/* CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) - -/* - * We model BTS tracing as another fixed-mode PMC. - * - * We choose a value in the middle of the fixed counter range, since lower - * values are used by actual fixed counters and higher values are used - * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. - */ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) - - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(void); - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(void) { } -#endif - -#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h new file mode 100644 index 000000000000..ad7ce3fd5065 --- /dev/null +++ b/arch/x86/include/asm/perf_event.h @@ -0,0 +1,108 @@ +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H + +/* + * Performance event hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 + +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_events:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union cpuid10_edx { + struct { + unsigned int num_events_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance events: + */ + +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) + +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed event range, since lower + * values are used by actual fixed events and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +extern void perf_events_lapic_init(void); + +#define PERF_EVENT_INDEX_OFFSET 0 + +#else +static inline void init_hw_perf_events(void) { } +static inline void perf_events_lapic_init(void) { } +#endif + +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 8deaada61bc8..6fb3c209a7e3 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -341,7 +341,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index b9f3c60de5f7..8d3ad0adbc68 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv) __SYSCALL(__NR_pwritev, sys_pwritev) #define __NR_rt_tgsigqueueinfo 297 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 298 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 298 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a34601f52987..754174d09deb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include +#include #include #include #include @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include @@ -1189,7 +1189,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 8dd30638fe44..68537e957a9b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2fea97eccf77..cc25c2b4a567 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -869,7 +869,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c deleted file mode 100644 index b1f115696c84..000000000000 --- a/arch/x86/kernel/cpu/perf_counter.c +++ /dev/null @@ -1,2298 +0,0 @@ -/* - * Performance counter x86 architecture code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static u64 perf_counter_mask __read_mostly; - -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS 4 - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -/* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) - -/* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) - - -/* - * Bits in the debugctlmsr controlling branch tracing. - */ -#define X86_DEBUGCTL_TR (1 << 6) -#define X86_DEBUGCTL_BTS (1 << 7) -#define X86_DEBUGCTL_BTINT (1 << 8) -#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) -#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; -}; - -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long interrupts; - int enabled; - struct debug_store *ds; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - u64 (*raw_event)(u64); - int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; - int apic; - u64 max_period; - u64 intel_ctrl; - void (*enable_bts)(u64 config); - void (*disable_bts)(void); -}; - -static struct x86_pmu x86_pmu __read_mostly; - -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { - .enabled = 1, -}; - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Counter setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_COUNTER 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -static const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static const u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - -/* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. - * Returns the delta events processed. - */ -static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - int shift = 64 - x86_pmu.counter_bits; - u64 prev_raw_count, new_raw_count; - s64 delta; - - if (idx == X86_PMC_IDX_FIXED_BTS) - return 0; - - /* - * Careful: an NMI might modify the previous counter value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: - */ -again: - prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); - - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -static atomic_t active_counters; -static DEFINE_MUTEX(pmc_reserve_mutex); - -static bool reserve_pmc_hardware(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int i; - - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) - goto perfctr_fail; - } - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) - goto eventsel_fail; - } -#endif - - return true; - -#ifdef CONFIG_X86_LOCAL_APIC -eventsel_fail: - for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); - - i = x86_pmu.num_counters; - -perfctr_fail: - for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); - - return false; -#endif -} - -static void release_pmc_hardware(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); - } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); -#endif -} - -static inline bool bts_available(void) -{ - return x86_pmu.enable_bts != NULL; -} - -static inline void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -static inline void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_counters, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static void release_bts_hardware(void) -{ - int cpu; - - if (!bts_available()) - return; - - get_online_cpus(); - - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_counters, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); - } - - put_online_cpus(); -} - -static int reserve_bts_hardware(void) -{ - int cpu, err = 0; - - if (!bts_available()) - return 0; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - - err = -ENOMEM; - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) { - kfree(buffer); - break; - } - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = - ds->bts_buffer_base + BTS_BUFFER_SIZE; - ds->bts_interrupt_threshold = - ds->bts_absolute_maximum - BTS_OVFL_TH; - - per_cpu(cpu_hw_counters, cpu).ds = ds; - err = 0; - } - - if (err) - release_bts_hardware(); - else { - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); - - return err; -} - -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_bts_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -static inline int x86_pmu_initialized(void) -{ - return x86_pmu.handle_irq != NULL; -} - -static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) -{ - unsigned int cache_type, cache_op, cache_result; - u64 config, val; - - config = attr->config; - - cache_type = (config >> 0) & 0xff; - if (cache_type >= PERF_COUNT_HW_CACHE_MAX) - return -EINVAL; - - cache_op = (config >> 8) & 0xff; - if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) - return -EINVAL; - - cache_result = (config >> 16) & 0xff; - if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - - if (val == 0) - return -ENOENT; - - if (val == -1) - return -EINVAL; - - hwc->config |= val; - - return 0; -} - -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - -/* - * Setup the hardware configuration for a given attr_type - */ -static int __hw_perf_counter_init(struct perf_counter *counter) -{ - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; - u64 config; - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = 0; - if (!atomic_inc_not_zero(&active_counters)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - err = reserve_bts_hardware(); - } - if (!err) - atomic_inc(&active_counters); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; - - counter->destroy = hw_perf_counter_destroy; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - hwc->config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to. - */ - if (!attr->exclude_user) - hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!attr->exclude_kernel) - hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - - if (!hwc->sample_period) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); - } else { - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * counters (user-space has to fall back and - * sample via a hrtimer based software counter): - */ - if (!x86_pmu.apic) - return -EOPNOTSUPP; - } - - /* - * Raw hw_event type provide the config in the hw_event structure - */ - if (attr->type == PERF_TYPE_RAW) { - hwc->config |= x86_pmu.raw_event(attr->config); - return 0; - } - - if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); - - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - - /* - * The generic map: - */ - config = x86_pmu.event_map(attr->config); - - if (config == 0) - return -ENOENT; - - if (config == -1LL) - return -EINVAL; - - /* - * Branch tracing: - */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { - /* BTS is not supported by this architecture. */ - if (!bts_available()) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - return -EOPNOTSUPP; - } - - hwc->config |= config; - - return 0; -} - -static void p6_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); -} - -static void amd_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int idx; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the - * right thing. - */ - barrier(); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} - -void hw_perf_disable(void) -{ - if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); -} - -static void p6_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_counter *counter = - cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!counter)) - return; - - intel_pmu_enable_bts(counter->hw.config); - } -} - -static void amd_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int idx; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - val = counter->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} - -void hw_perf_enable(void) -{ - if (!x86_pmu_initialized()) - return; - x86_pmu.enable_all(); -} - -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); -} - -static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - -static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; - } - - x86_pmu_disable_counter(hwc, idx); -} - -static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - x86_pmu_disable_counter(hwc, idx); -} - -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); - -/* - * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: - */ -static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int err, ret = 0; - - if (idx == X86_PMC_IDX_FIXED_BTS) - return 0; - - /* - * If we are way outside a reasoable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - /* - * Quirk: certain CPUs dont like it if just 1 hw_event is left: - */ - if (unlikely(left < 2)) - left = 2; - - if (left > x86_pmu.max_period) - left = x86_pmu.max_period; - - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - - /* - * The hw counter starts counting from this counter offset, - * mark it to be able to extra future deltas: - */ - atomic64_set(&hwc->prev_count, (u64)-left); - - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); - - perf_counter_update_userpage(counter); - - return ret; -} - -static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - - -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_counters).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - x86_pmu_enable_counter(hwc, idx); -} - -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); -} - -static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; - - if (!x86_pmu.num_counters_fixed) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * Find a PMC slot for the freshly enabled / scheduled in counter: - */ -static int x86_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx; - - idx = fixed_mode_idx(counter, hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->counter_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->counter_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; - } - - perf_counters_lapic_init(); - - x86_pmu.disable(hwc, idx); - - cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active_mask); - - x86_perf_counter_set_period(counter, hwc, idx); - x86_pmu.enable(hwc, idx); - - perf_counter_update_userpage(counter); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) - return; - - x86_pmu.enable(hwc, hwc->idx); -} - -void perf_counter_print_debug(void) -{ - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; - unsigned long flags; - int cpu, idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - pr_info("\n"); - pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); - pr_info("CPU#%d: status: %016llx\n", cpu, status); - pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); - pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); - - prev_left = per_cpu(pmc_prev_left[idx], cpu); - - pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", - cpu, idx, pmc_ctrl); - pr_info("CPU#%d: gen-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - pr_info("CPU#%d: gen-PMC%d left: %016llx\n", - cpu, idx, prev_left); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - - pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - } - local_irq_restore(flags); -} - -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!counter) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = counter->hw.last_period; - data.addr = 0; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, counter, ®s); - - if (perf_output_begin(&handle, counter, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, counter); - } - - perf_output_end(&handle); - - /* There's new data available. */ - counter->hw.interrupts++; - counter->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); - - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: - */ - barrier(); - - /* - * Drain the remaining delta count out of a counter - * that we are disabling: - */ - x86_perf_counter_update(counter, hwc, idx); - - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - - cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_counter_update_userpage(counter); -} - -/* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: - */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - int ret; - - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); - - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - counter = cpuc->counters[idx]; - hwc = &counter->hw; - - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; - - /* - * counter overflow - */ - handled = 1; - data.period = counter->hw.last_period; - - if (!x86_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - p6_pmu_disable_counter(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(counter)) - continue; - - data.period = counter->hw.last_period; - - if (perf_counter_overflow(counter, 1, &data, regs)) - intel_pmu_disable_counter(&counter->hw, bit); - } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; -} - -static int amd_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - counter = cpuc->counters[idx]; - hwc = &counter->hw; - - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; - - /* - * counter overflow - */ - handled = 1; - data.period = counter->hw.last_period; - - if (!x86_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - amd_pmu_disable_counter(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); - irq_exit(); -} - -void set_perf_counter_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - -void perf_counters_lapic_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - /* - * Always use NMI for PMU - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif -} - -static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) -{ - struct die_args *args = __args; - struct pt_regs *regs; - - if (!atomic_read(&active_counters)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - case DIE_NMI_IPI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; - -#ifdef CONFIG_X86_LOCAL_APIC - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif - /* - * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); - - return NOTIFY_STOP; -} - -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, - .next = NULL, - .priority = 1 -}; - -static struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_counters = 2, - /* - * Counters have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, -}; - -static struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, -}; - -static struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, -}; - -static int p6_pmu_init(void) -{ - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - break; - case 9: - case 13: - /* Pentium M */ - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - x86_pmu = p6_pmu; - - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - - return 0; -} - -static int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - return -ENODEV; - - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; - - /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: - */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Core2 events, "); - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Atom events, "); - break; - } - return 0; -} - -static int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} - -void __init init_hw_perf_counters(void) -{ - int err; - - pr_info("Performance Counters: "); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_pmu_init(); - break; - case X86_VENDOR_AMD: - err = amd_pmu_init(); - break; - default: - return; - } - if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); - return; - } - - pr_cont("%s PMU driver.\n", x86_pmu.name); - - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; - - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); - - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); -} - -static inline void x86_pmu_read(struct perf_counter *counter) -{ - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); -} - -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, -}; - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - int err; - - err = __hw_perf_counter_init(counter); - if (err) { - if (counter->destroy) - counter->destroy(counter); - return ERR_PTR(err); - } - - return &pmu; -} - -/* - * callchain support - */ - -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); - - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); - - return 0; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry *entry = data; - - if (per_cpu(in_nmi_frame, smp_processor_id())) - return; - - if (reliable) - callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, -}; - -#include "../dumpstack.h" - -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); - - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); -} - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page, type); - memcpy(to, map+offset, size); - kunmap_atomic(map, type); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - unsigned long bytes; - - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); - - return bytes == sizeof(*frame); -} - -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - struct stack_frame frame; - const void __user *fp; - - if (!user_mode(regs)) - regs = task_pt_regs(current); - - fp = (void __user *)regs->bp; - - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); - - while (entry->nr < PERF_MAX_STACK_DEPTH) { - frame.next_frame = NULL; - frame.return_address = 0; - - if (!copy_stack_frame(fp, &frame)) - break; - - if ((unsigned long)fp < regs->sp) - break; - - callchain_store(entry, frame.return_address); - fp = frame.next_frame; - } -} - -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} - -void hw_perf_counter_setup_online(int cpu) -{ - init_debug_store_on_cpu(cpu); -} diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c new file mode 100644 index 000000000000..0d03629fb1a5 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.c @@ -0,0 +1,2298 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static u64 perf_event_mask __read_mostly; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long interrupts; + int enabled; + struct debug_store *ds; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(void); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + u64 (*raw_event)(u64); + int max_events; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; + int apic; + u64 max_period; + u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, +}; + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ + CORE_EVNTSEL_REG_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + int shift = 64 - x86_pmu.event_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; + + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->event_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &event->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static atomic_t active_events; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < x86_pmu.num_events; i++) { + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_events; i++) { + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + goto eventsel_fail; + } +#endif + + return true; + +#ifdef CONFIG_X86_LOCAL_APIC +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu.eventsel + i); + + i = x86_pmu.num_events; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu.perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +#endif +} + +static void release_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + for (i = 0; i < x86_pmu.num_events; i++) { + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +#endif +} + +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_events, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_bts_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + u64 config; + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + event->destroy = hw_perf_event_destroy; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to. + */ + if (!attr->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!attr->exclude_kernel) + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + + if (!hwc->sample_period) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; + } + + /* + * Raw hw_event type provide the config in the hw_event structure + */ + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); + return 0; + } + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + + hwc->config |= config; + + return 0; +} + +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * events proper, so that amd_pmu_enable_event() does the + * right thing. + */ + barrier(); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_disable(void) +{ + if (!x86_pmu_initialized()) + return; + return x86_pmu.disable_all(); +} + +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static void amd_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + val = event->hw.config; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_enable(void) +{ + if (!x86_pmu_initialized()) + return; + x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static inline void +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_event(hwc, idx); +} + +static inline void +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + x86_pmu_disable_event(hwc, idx); +} + +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +static int +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int err, ret = 0; + + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 hw_event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)-left); + + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); + + perf_event_update_userpage(event); + + return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + + +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_event(hwc, idx); +} + +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + x86_pmu_enable_event(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +{ + unsigned int hw_event; + + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + + if (!x86_pmu.num_events_fixed) + return -1; + + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + + return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = fixed_mode_idx(event, hwc); + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->event_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { + /* + * Try to get the fixed event, if that is already taken + * then try to get a generic event: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic event again */ + if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: + idx = find_first_zero_bit(cpuc->used_mask, + x86_pmu.num_events); + if (idx == x86_pmu.num_events) + return -EAGAIN; + + set_bit(idx, cpuc->used_mask); + hwc->idx = idx; + } + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } + + perf_events_lapic_init(); + + x86_pmu.disable(hwc, idx); + + cpuc->events[idx] = event; + set_bit(idx, cpuc->active_mask); + + x86_perf_event_set_period(event, hwc, idx); + x86_pmu.enable(hwc, idx); + + perf_event_update_userpage(event); + + return 0; +} + +static void x86_pmu_unthrottle(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->events[hwc->idx] != event)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + +void perf_event_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + struct cpu_hw_events *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + } + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); + + prev_left = per_cpu(pmc_prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static void x86_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active_mask); + x86_pmu.disable(hwc, idx); + + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the event: + */ + barrier(); + + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); + + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + int ret; + + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); + + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) + continue; + + /* + * event overflow + */ + handled = 1; + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + perf_disable(); + intel_pmu_drain_bts_buffer(cpuc); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) + continue; + + /* + * event overflow + */ + handled = 1; + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_event_do_pending(); + irq_exit(); +} + +void set_perf_event_pending(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif +} + +void perf_events_lapic_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif +} + +static int __kprobes +perf_event_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (!atomic_read(&active_events)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + +#ifdef CONFIG_X86_LOCAL_APIC + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif + /* + * Can't rely on the handled return value to say it was our NMI, two + * events could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, + .next = NULL, + .priority = 1 +}; + +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, +}; + +static struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, +}; + +static struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, +}; + +static int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + if (!cpu_has_apic) { + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; + } + + return 0; +} + +static int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + return -ENODEV; + + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Core2 events, "); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Atom events, "); + break; + } + return 0; +} + +static int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void __init init_hw_perf_events(void) +{ + int err; + + pr_info("Performance Events: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + return; + } + if (err != 0) { + pr_cont("no PMU driver, software events only.\n"); + return; + } + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; + } + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; + + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; + } + + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; + + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic events: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); +} + +static inline void x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event, &event->hw, event->hw.idx); +} + +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + int err; + + err = __hw_perf_event_init(event); + if (err) { + if (event->destroy) + event->destroy(event); + return ERR_PTR(err); + } + + return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + if (entry->nr < PERF_MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +#include "../dumpstack.h" + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + unsigned long bytes; + + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + + return bytes == sizeof(*frame); +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + if (!user_mode(regs)) + regs = task_pt_regs(current); + + fp = (void __user *)regs->bp; + + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, regs->ip); + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + frame.next_frame = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < regs->sp) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_frame; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(pmc_nmi_entry); + else + entry = &__get_cpu_var(pmc_irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} + +void hw_perf_event_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b890..fab786f60ed6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d59fe323807e..681c3fda7391 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1021,7 +1021,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 300883112e3d..40f30773fb29 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -208,7 +208,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafda..0157cd26d7cc 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 775a020990a5..82728f2c6d55 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,7 +10,7 @@ #include /* max_low_pfn */ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ -#include /* perf_swcounter_event */ +#include /* perf_sw_event */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -1017,7 +1017,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1114,11 +1114,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4899215999de..8eb05878554c 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void) if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; - eax.split.num_counters = 2; + eax.split.num_events = 2; eax.split.bit_width = 40; } - num_counters = eax.split.num_counters; + num_counters = eax.split.num_events; op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index b83776180c7f..7b8e75d16081 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -13,7 +13,7 @@ #define OP_X86_MODEL_H #include -#include +#include struct op_msr { unsigned long addr; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 50eecfe1d724..44203ff599da 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -252,7 +252,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) struct pt_regs *regs = get_irq_regs(); if (regs) show_regs(regs); - perf_counter_print_debug(); + perf_event_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, diff --git a/fs/exec.c b/fs/exec.c index 172ceb6edde4..434dba778ccc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include @@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_counter_comm(tsk); + perf_event_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) @@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm) * security domain: */ if (!get_dumpable(current->mm)) - perf_counter_exit_task(current); + perf_event_exit_task(current); /* An exec changes our domain. We are no longer part of the thread group */ diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 1125e5a1ee5d..d76b66acea95 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -620,8 +620,8 @@ __SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_rt_tgsigqueueinfo 240 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 241 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 241 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #undef __NR_syscalls #define __NR_syscalls 242 diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9e7f2e8fc66e..21a6f5d9af22 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -106,13 +106,13 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_mutex = \ - __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ - .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#ifdef CONFIG_PERF_EVENTS +# define INIT_PERF_EVENTS(tsk) \ + .perf_event_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_event_mutex), \ + .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list), #else -# define INIT_PERF_COUNTERS(tsk) +# define INIT_PERF_EVENTS(tsk) #endif /* @@ -178,7 +178,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ + INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h deleted file mode 100644 index f64862732673..000000000000 --- a/include/linux/perf_counter.h +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Performance counters: - * - * Copyright (C) 2008-2009, Thomas Gleixner - * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra - * - * Data type definitions, declarations, prototypes. - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * For licencing details see kernel-base/COPYING - */ -#ifndef _LINUX_PERF_COUNTER_H -#define _LINUX_PERF_COUNTER_H - -#include -#include -#include - -/* - * User-space ABI bits: - */ - -/* - * attr.type - */ -enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, - - PERF_TYPE_MAX, /* non-ABI */ -}; - -/* - * Generalized performance counter event types, used by the - * attr.event_id parameter of the sys_perf_counter_open() - * syscall: - */ -enum perf_hw_id { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_HW_CPU_CYCLES = 0, - PERF_COUNT_HW_INSTRUCTIONS = 1, - PERF_COUNT_HW_CACHE_REFERENCES = 2, - PERF_COUNT_HW_CACHE_MISSES = 3, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_HW_BRANCH_MISSES = 5, - PERF_COUNT_HW_BUS_CYCLES = 6, - - PERF_COUNT_HW_MAX, /* non-ABI */ -}; - -/* - * Generalized hardware cache counters: - * - * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x - * { read, write, prefetch } x - * { accesses, misses } - */ -enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - - PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, - PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ -}; - -/* - * Special "software" counters provided by the kernel, even if the hardware - * does not support performance counters. These counters measure various - * physical and sw events of the kernel (and allow the profiling of them as - * well): - */ -enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - - PERF_COUNT_SW_MAX, /* non-ABI */ -}; - -/* - * Bits that can be set in attr.sample_type to request information - * in the overflow packets. - */ -enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_READ = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, - PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_RAW = 1U << 10, - - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ -}; - -/* - * The format of the data returned by read() on a perf counter fd, - * as specified by attr.read_format: - * - * struct read_format { - * { u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 id; } && PERF_FORMAT_ID - * } && !PERF_FORMAT_GROUP - * - * { u64 nr; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 value; - * { u64 id; } && PERF_FORMAT_ID - * } cntr[nr]; - * } && PERF_FORMAT_GROUP - * }; - */ -enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, - PERF_FORMAT_GROUP = 1U << 3, - - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ -}; - -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_attr { - - /* - * Major type: hardware/software/tracepoint/etc. - */ - __u32 type; - - /* - * Size of the attr structure, for fwd/bwd compat. - */ - __u32 size; - - /* - * Type specific configuration information. - */ - __u64 config; - - union { - __u64 sample_period; - __u64 sample_freq; - }; - - __u64 sample_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - mmap : 1, /* include mmap data */ - comm : 1, /* include comm data */ - freq : 1, /* use freq, not period */ - inherit_stat : 1, /* per task counts */ - enable_on_exec : 1, /* next exec enables */ - task : 1, /* trace fork/exit */ - watermark : 1, /* wakeup_watermark */ - - __reserved_1 : 49; - - union { - __u32 wakeup_events; /* wakeup every n events */ - __u32 wakeup_watermark; /* bytes before wakeup */ - }; - __u32 __reserved_2; - - __u64 __reserved_3; -}; - -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) -#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) -#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) -#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) - -enum perf_counter_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, -}; - -/* - * Structure of the page that can be mapped via mmap - */ -struct perf_counter_mmap_page { - __u32 version; /* version number of this structure */ - __u32 compat_version; /* lowest version this is compat with */ - - /* - * Bits needed to read the hw counters in user-space. - * - * u32 seq; - * s64 count; - * - * do { - * seq = pc->lock; - * - * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; - * - * barrier(); - * } while (pc->lock != seq); - * - * NOTE: for obvious reason this only works on self-monitoring - * processes. - */ - __u32 lock; /* seqlock for synchronization */ - __u32 index; /* hardware counter identifier */ - __s64 offset; /* add to hardware counter value */ - __u64 time_enabled; /* time counter active */ - __u64 time_running; /* time counter on cpu */ - - /* - * Hole for extension of the self monitor capabilities - */ - - __u64 __reserved[123]; /* align to 1k */ - - /* - * Control data for the mmap() data buffer. - * - * User-space reading the @data_head value should issue an rmb(), on - * SMP capable platforms, after reading this value -- see - * perf_counter_wakeup(). - * - * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data. In this case - * the kernel will not over-write unread data. - */ - __u64 data_head; /* head in the data section */ - __u64 data_tail; /* user-space written tail */ -}; - -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) - -struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; -}; - -enum perf_event_type { - - /* - * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: - * - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; - * }; - */ - PERF_EVENT_MMAP = 1, - - /* - * struct { - * struct perf_event_header header; - * u64 id; - * u64 lost; - * }; - */ - PERF_EVENT_LOST = 2, - - /* - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * char comm[]; - * }; - */ - PERF_EVENT_COMM = 3, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * u64 time; - * }; - */ - PERF_EVENT_EXIT = 4, - - /* - * struct { - * struct perf_event_header header; - * u64 time; - * u64 id; - * u64 stream_id; - * }; - */ - PERF_EVENT_THROTTLE = 5, - PERF_EVENT_UNTHROTTLE = 6, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * { u64 time; } && PERF_SAMPLE_TIME - * }; - */ - PERF_EVENT_FORK = 7, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, tid; - * - * struct read_format values; - * }; - */ - PERF_EVENT_READ = 8, - - /* - * struct { - * struct perf_event_header header; - * - * { u64 ip; } && PERF_SAMPLE_IP - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 addr; } && PERF_SAMPLE_ADDR - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU - * { u64 period; } && PERF_SAMPLE_PERIOD - * - * { struct read_format values; } && PERF_SAMPLE_READ - * - * { u64 nr, - * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN - * - * # - * # The RAW record below is opaque data wrt the ABI - * # - * # That is, the ABI doesn't make any promises wrt to - * # the stability of its content, it may vary depending - * # on event, hardware, kernel version and phase of - * # the moon. - * # - * # In other words, PERF_SAMPLE_RAW contents are not an ABI. - * # - * - * { u32 size; - * char data[size];}&& PERF_SAMPLE_RAW - * }; - */ - PERF_EVENT_SAMPLE = 9, - - PERF_EVENT_MAX, /* non-ABI */ -}; - -enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, - - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, - - PERF_CONTEXT_MAX = (__u64)-4095, -}; - -#define PERF_FLAG_FD_NO_GROUP (1U << 0) -#define PERF_FLAG_FD_OUTPUT (1U << 1) - -#ifdef __KERNEL__ -/* - * Kernel-internal data types and definitions: - */ - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define PERF_MAX_STACK_DEPTH 255 - -struct perf_callchain_entry { - __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; -}; - -struct perf_raw_record { - u32 size; - void *data; -}; - -struct task_struct; - -/** - * struct hw_perf_counter - performance counter hardware details: - */ -struct hw_perf_counter { -#ifdef CONFIG_PERF_COUNTERS - union { - struct { /* hardware */ - u64 config; - unsigned long config_base; - unsigned long counter_base; - int idx; - }; - union { /* software */ - atomic64_t count; - struct hrtimer hrtimer; - }; - }; - atomic64_t prev_count; - u64 sample_period; - u64 last_period; - atomic64_t period_left; - u64 interrupts; - - u64 freq_count; - u64 freq_interrupts; - u64 freq_stamp; -#endif -}; - -struct perf_counter; - -/** - * struct pmu - generic performance monitoring unit - */ -struct pmu { - int (*enable) (struct perf_counter *counter); - void (*disable) (struct perf_counter *counter); - void (*read) (struct perf_counter *counter); - void (*unthrottle) (struct perf_counter *counter); -}; - -/** - * enum perf_counter_active_state - the states of a counter - */ -enum perf_counter_active_state { - PERF_COUNTER_STATE_ERROR = -2, - PERF_COUNTER_STATE_OFF = -1, - PERF_COUNTER_STATE_INACTIVE = 0, - PERF_COUNTER_STATE_ACTIVE = 1, -}; - -struct file; - -struct perf_mmap_data { - struct rcu_head rcu_head; - int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ - int nr_locked; /* nr pages mlocked */ - - atomic_t poll; /* POLL_ for wakeups */ - atomic_t events; /* event limit */ - - atomic_long_t head; /* write position */ - atomic_long_t done_head; /* completed head */ - - atomic_t lock; /* concurrent writes */ - atomic_t wakeup; /* needs a wakeup */ - atomic_t lost; /* nr records lost */ - - long watermark; /* wakeup watermark */ - - struct perf_counter_mmap_page *user_page; - void *data_pages[0]; -}; - -struct perf_pending_entry { - struct perf_pending_entry *next; - void (*func)(struct perf_pending_entry *); -}; - -/** - * struct perf_counter - performance counter kernel representation: - */ -struct perf_counter { -#ifdef CONFIG_PERF_COUNTERS - struct list_head group_entry; - struct list_head event_entry; - struct list_head sibling_list; - int nr_siblings; - struct perf_counter *group_leader; - struct perf_counter *output; - const struct pmu *pmu; - - enum perf_counter_active_state state; - atomic64_t count; - - /* - * These are the total time in nanoseconds that the counter - * has been enabled (i.e. eligible to run, and the task has - * been scheduled in, if this is a per-task counter) - * and running (scheduled onto the CPU), respectively. - * - * They are computed from tstamp_enabled, tstamp_running and - * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. - */ - u64 total_time_enabled; - u64 total_time_running; - - /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the counter is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the counter was enabled - * tstamp_running: the notional time when the counter was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * counter was scheduled off. - */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; - - struct perf_counter_attr attr; - struct hw_perf_counter hw; - - struct perf_counter_context *ctx; - struct file *filp; - - /* - * These accumulate total time (in nanoseconds) that children - * counters have been enabled and running, respectively. - */ - atomic64_t child_total_time_enabled; - atomic64_t child_total_time_running; - - /* - * Protect attach/detach and child_list: - */ - struct mutex child_mutex; - struct list_head child_list; - struct perf_counter *parent; - - int oncpu; - int cpu; - - struct list_head owner_entry; - struct task_struct *owner; - - /* mmap bits */ - struct mutex mmap_mutex; - atomic_t mmap_count; - struct perf_mmap_data *data; - - /* poll related */ - wait_queue_head_t waitq; - struct fasync_struct *fasync; - - /* delayed work for NMIs and such */ - int pending_wakeup; - int pending_kill; - int pending_disable; - struct perf_pending_entry pending; - - atomic_t event_limit; - - void (*destroy)(struct perf_counter *); - struct rcu_head rcu_head; - - struct pid_namespace *ns; - u64 id; -#endif -}; - -/** - * struct perf_counter_context - counter context structure - * - * Used as a container for task counters and CPU counters as well: - */ -struct perf_counter_context { - /* - * Protect the states of the counters in the list, - * nr_active, and the list: - */ - spinlock_t lock; - /* - * Protect the list of counters. Locking either mutex or lock - * is sufficient to ensure the list doesn't change; to change - * the list you need to lock both the mutex and the spinlock. - */ - struct mutex mutex; - - struct list_head group_list; - struct list_head event_list; - int nr_counters; - int nr_active; - int is_active; - int nr_stat; - atomic_t refcount; - struct task_struct *task; - - /* - * Context clock, runs when context enabled. - */ - u64 time; - u64 timestamp; - - /* - * These fields let us detect when two contexts have both - * been cloned (inherited) from a common ancestor. - */ - struct perf_counter_context *parent_ctx; - u64 parent_gen; - u64 generation; - int pin_count; - struct rcu_head rcu_head; -}; - -/** - * struct perf_counter_cpu_context - per cpu counter context structure - */ -struct perf_cpu_context { - struct perf_counter_context ctx; - struct perf_counter_context *task_ctx; - int active_oncpu; - int max_pertask; - int exclusive; - - /* - * Recursion avoidance: - * - * task, softirq, irq, nmi context - */ - int recursion[4]; -}; - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - -#ifdef CONFIG_PERF_COUNTERS - -/* - * Set by architecture code: - */ -extern int perf_max_counters; - -extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); - -extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu); -extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern int perf_counter_init_task(struct task_struct *child); -extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_free_task(struct task_struct *task); -extern void set_perf_counter_pending(void); -extern void perf_counter_do_pending(void); -extern void perf_counter_print_debug(void); -extern void __perf_disable(void); -extern bool __perf_enable(void); -extern void perf_disable(void); -extern void perf_enable(void); -extern int perf_counter_task_disable(void); -extern int perf_counter_task_enable(void); -extern int hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu); -extern void perf_counter_update_userpage(struct perf_counter *counter); - -struct perf_sample_data { - u64 type; - - u64 ip; - struct { - u32 pid; - u32 tid; - } tid_entry; - u64 time; - u64 addr; - u64 id; - u64 stream_id; - struct { - u32 cpu; - u32 reserved; - } cpu_entry; - u64 period; - struct perf_callchain_entry *callchain; - struct perf_raw_record *raw; -}; - -extern void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter); -extern void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter, - struct pt_regs *regs); - -extern int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs); - -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return (counter->attr.type != PERF_TYPE_RAW) && - (counter->attr.type != PERF_TYPE_HARDWARE) && - (counter->attr.type != PERF_TYPE_HW_CACHE); -} - -extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); - -static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) -{ - if (atomic_read(&perf_swcounter_enabled[event])) - __perf_swcounter_event(event, nr, nmi, regs, addr); -} - -extern void __perf_counter_mmap(struct vm_area_struct *vma); - -static inline void perf_counter_mmap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_EXEC) - __perf_counter_mmap(vma); -} - -extern void perf_counter_comm(struct task_struct *tsk); -extern void perf_counter_fork(struct task_struct *tsk); - -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); - -extern int sysctl_perf_counter_paranoid; -extern int sysctl_perf_counter_mlock; -extern int sysctl_perf_counter_sample_rate; - -extern void perf_counter_init(void); -extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, - void *record, int entry_size); - -#ifndef perf_misc_flags -#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ - PERF_EVENT_MISC_KERNEL) -#define perf_instruction_pointer(regs) instruction_pointer(regs) -#endif - -extern int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample); -extern void perf_output_end(struct perf_output_handle *handle); -extern void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len); -#else -static inline void -perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -static inline void -perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) { } -static inline void -perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline int perf_counter_init_task(struct task_struct *child) { return 0; } -static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_free_task(struct task_struct *task) { } -static inline void perf_counter_do_pending(void) { } -static inline void perf_counter_print_debug(void) { } -static inline void perf_disable(void) { } -static inline void perf_enable(void) { } -static inline int perf_counter_task_disable(void) { return -EINVAL; } -static inline int perf_counter_task_enable(void) { return -EINVAL; } - -static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) { } - -static inline void perf_counter_mmap(struct vm_area_struct *vma) { } -static inline void perf_counter_comm(struct task_struct *tsk) { } -static inline void perf_counter_fork(struct task_struct *tsk) { } -static inline void perf_counter_init(void) { } - -#endif - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -#endif /* __KERNEL__ */ -#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h new file mode 100644 index 000000000000..ae9d9ed6df2a --- /dev/null +++ b/include/linux/perf_event.h @@ -0,0 +1,858 @@ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_EVENT_H +#define _LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + + __reserved_1 : 49; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + __u32 __reserved_2; + + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + + /* + * Hole for extension of the self monitor capabilities + */ + + __u64 __reserved[123]; /* align to 1k */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_event_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ +}; + +#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * { u64 time; } && PERF_SAMPLE_TIME + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event_id, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * }; + */ + PERF_RECORD_SAMPLE = 9, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + +#ifdef __KERNEL__ +/* + * Kernel-internal data types and definitions: + */ + +#ifdef CONFIG_PERF_EVENTS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PERF_MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + __u64 nr; + __u64 ip[PERF_MAX_STACK_DEPTH]; +}; + +struct perf_raw_record { + u32 size; + void *data; +}; + +struct task_struct; + +/** + * struct hw_perf_event - performance event hardware details: + */ +struct hw_perf_event { +#ifdef CONFIG_PERF_EVENTS + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long event_base; + int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; + atomic64_t prev_count; + u64 sample_period; + u64 last_period; + atomic64_t period_left; + u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; + u64 freq_stamp; +#endif +}; + +struct perf_event; + +/** + * struct pmu - generic performance monitoring unit + */ +struct pmu { + int (*enable) (struct perf_event *event); + void (*disable) (struct perf_event *event); + void (*read) (struct perf_event *event); + void (*unthrottle) (struct perf_event *event); +}; + +/** + * enum perf_event_active_state - the states of a event + */ +enum perf_event_active_state { + PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_OFF = -1, + PERF_EVENT_STATE_INACTIVE = 0, + PERF_EVENT_STATE_ACTIVE = 1, +}; + +struct file; + +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ + int nr_locked; /* nr pages mlocked */ + + atomic_t poll; /* POLL_ for wakeups */ + atomic_t events; /* event_id limit */ + + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + + atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + atomic_t lost; /* nr records lost */ + + long watermark; /* wakeup watermark */ + + struct perf_event_mmap_page *user_page; + void *data_pages[0]; +}; + +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); +}; + +/** + * struct perf_event - performance event kernel representation: + */ +struct perf_event { +#ifdef CONFIG_PERF_EVENTS + struct list_head group_entry; + struct list_head event_entry; + struct list_head sibling_list; + int nr_siblings; + struct perf_event *group_leader; + struct perf_event *output; + const struct pmu *pmu; + + enum perf_event_active_state state; + atomic64_t count; + + /* + * These are the total time in nanoseconds that the event + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task event) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the event is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the event is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the event was enabled + * tstamp_running: the notional time when the event was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * event was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + + struct perf_event_attr attr; + struct hw_perf_event hw; + + struct perf_event_context *ctx; + struct file *filp; + + /* + * These accumulate total time (in nanoseconds) that children + * events have been enabled and running, respectively. + */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + + /* + * Protect attach/detach and child_list: + */ + struct mutex child_mutex; + struct list_head child_list; + struct perf_event *parent; + + int oncpu; + int cpu; + + struct list_head owner_entry; + struct task_struct *owner; + + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; + + /* poll related */ + wait_queue_head_t waitq; + struct fasync_struct *fasync; + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_kill; + int pending_disable; + struct perf_pending_entry pending; + + atomic_t event_limit; + + void (*destroy)(struct perf_event *); + struct rcu_head rcu_head; + + struct pid_namespace *ns; + u64 id; +#endif +}; + +/** + * struct perf_event_context - event context structure + * + * Used as a container for task events and CPU events as well: + */ +struct perf_event_context { + /* + * Protect the states of the events in the list, + * nr_active, and the list: + */ + spinlock_t lock; + /* + * Protect the list of events. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; + + struct list_head group_list; + struct list_head event_list; + int nr_events; + int nr_active; + int is_active; + int nr_stat; + atomic_t refcount; + struct task_struct *task; + + /* + * Context clock, runs when context enabled. + */ + u64 time; + u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. + */ + struct perf_event_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; +}; + +/** + * struct perf_event_cpu_context - per cpu event context structure + */ +struct perf_cpu_context { + struct perf_event_context ctx; + struct perf_event_context *task_ctx; + int active_oncpu; + int max_pertask; + int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; +}; + +struct perf_output_handle { + struct perf_event *event; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; +}; + +#ifdef CONFIG_PERF_EVENTS + +/* + * Set by architecture code: + */ +extern int perf_max_events; + +extern const struct pmu *hw_perf_event_init(struct perf_event *event); + +extern void perf_event_task_sched_in(struct task_struct *task, int cpu); +extern void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); +extern void perf_event_task_tick(struct task_struct *task, int cpu); +extern int perf_event_init_task(struct task_struct *child); +extern void perf_event_exit_task(struct task_struct *child); +extern void perf_event_free_task(struct task_struct *task); +extern void set_perf_event_pending(void); +extern void perf_event_do_pending(void); +extern void perf_event_print_debug(void); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); +extern int perf_event_task_disable(void); +extern int perf_event_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu); +extern void perf_event_update_userpage(struct perf_event *event); + +struct perf_sample_data { + u64 type; + + u64 ip; + struct { + u32 pid; + u32 tid; + } tid_entry; + u64 time; + u64 addr; + u64 id; + u64 stream_id; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; + u64 period; + struct perf_callchain_entry *callchain; + struct perf_raw_record *raw; +}; + +extern void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs); + +extern int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs); + +/* + * Return 1 for a software event, 0 for a hardware event + */ +static inline int is_software_event(struct perf_event *event) +{ + return (event->attr.type != PERF_TYPE_RAW) && + (event->attr.type != PERF_TYPE_HARDWARE) && + (event->attr.type != PERF_TYPE_HW_CACHE); +} + +extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); + +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + if (atomic_read(&perf_swevent_enabled[event_id])) + __perf_sw_event(event_id, nr, nmi, regs, addr); +} + +extern void __perf_event_mmap(struct vm_area_struct *vma); + +static inline void perf_event_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_event_mmap(vma); +} + +extern void perf_event_comm(struct task_struct *tsk); +extern void perf_event_fork(struct task_struct *tsk); + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + +extern int sysctl_perf_event_paranoid; +extern int sysctl_perf_event_mlock; +extern int sysctl_perf_event_sample_rate; + +extern void perf_event_init(void); +extern void perf_tp_event(int event_id, u64 addr, u64 count, + void *record, int entry_size); + +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ + PERF_RECORD_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + +extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample); +extern void perf_output_end(struct perf_output_handle *handle); +extern void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len); +#else +static inline void +perf_event_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } +static inline void +perf_event_task_tick(struct task_struct *task, int cpu) { } +static inline int perf_event_init_task(struct task_struct *child) { return 0; } +static inline void perf_event_exit_task(struct task_struct *child) { } +static inline void perf_event_free_task(struct task_struct *task) { } +static inline void perf_event_do_pending(void) { } +static inline void perf_event_print_debug(void) { } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } +static inline int perf_event_task_disable(void) { return -EINVAL; } +static inline int perf_event_task_enable(void) { return -EINVAL; } + +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } + +static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_comm(struct task_struct *tsk) { } +static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_init(void) { } + +#endif + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + +#endif /* __KERNEL__ */ +#endif /* _LINUX_PERF_EVENT_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index b00df4c79c63..07bff666e65b 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,7 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8af3d249170e..8b265a8986d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,7 +100,7 @@ struct robust_list_head; struct bio; struct fs_struct; struct bts_context; -struct perf_counter_context; +struct perf_event_context; /* * List of flags we want to share for kernel threads, @@ -701,7 +701,7 @@ struct user_struct { #endif #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif }; @@ -1449,10 +1449,10 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif -#ifdef CONFIG_PERF_COUNTERS - struct perf_counter_context *perf_counter_ctxp; - struct mutex perf_counter_mutex; - struct list_head perf_counter_list; +#ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp; + struct mutex perf_event_mutex; + struct list_head perf_event_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a8e37821cc60..02f19f9a76c6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_attr; +struct perf_event_attr; #include #include @@ -885,7 +885,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage long sys_perf_counter_open( - struct perf_counter_attr __user *attr_uptr, +asmlinkage long sys_perf_event_open( + struct perf_event_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 72a3b437b829..ec91e78244f0 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -378,7 +378,7 @@ static inline int ftrace_get_offsets_##call( \ #ifdef CONFIG_EVENT_PROFILE /* - * Generate the functions needed for tracepoint perf_counter support. + * Generate the functions needed for tracepoint perf_event support. * * NOTE: The insertion profile callback (ftrace_profile_) is defined later * @@ -656,7 +656,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * { * struct ftrace_data_offsets_ __maybe_unused __data_offsets; * struct ftrace_event_call *event_call = &event_; - * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * extern void perf_tp_event(int, u64, u64, void *, int); * struct ftrace_raw_##call *entry; * u64 __addr = 0, __count = 1; * unsigned long irq_flags; @@ -691,7 +691,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * * <- affect our values * - * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * perf_tp_event(event_call->id, __addr, __count, entry, * __entry_size); <- submit them to perf counter * } while (0); * @@ -712,7 +712,7 @@ static void ftrace_profile_##call(proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_event_call *event_call = &event_##call; \ - extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ @@ -742,7 +742,7 @@ static void ftrace_profile_##call(proto) \ \ { assign; } \ \ - perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + perf_tp_event(event_call->id, __addr, __count, entry,\ __entry_size); \ } while (0); \ \ diff --git a/init/Kconfig b/init/Kconfig index 8e8b76d8a272..cfdf5c322806 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -915,17 +915,17 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. -config HAVE_PERF_COUNTERS +config HAVE_PERF_EVENTS bool help See tools/perf/design.txt for details. menu "Performance Counters" -config PERF_COUNTERS +config PERF_EVENTS bool "Kernel Performance Counters" default y if PROFILING - depends on HAVE_PERF_COUNTERS + depends on HAVE_PERF_EVENTS select ANON_INODES help Enable kernel support for performance counter hardware. @@ -947,7 +947,7 @@ config PERF_COUNTERS config EVENT_PROFILE bool "Tracepoint profiling sources" - depends on PERF_COUNTERS && EVENT_TRACING + depends on PERF_EVENTS && EVENT_TRACING default y help Allow the use of tracepoints as software performance counters. diff --git a/kernel/Makefile b/kernel/Makefile index 3d9c7e27e3f9..e26a546eac44 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,7 +96,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index ae5d8660ddff..e47ee8a06135 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -981,7 +981,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index bfee931ee3fb..2cebfb23b0b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include @@ -1078,7 +1078,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1253,7 +1253,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1280,7 +1280,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c deleted file mode 100644 index 62de0db8092b..000000000000 --- a/kernel/perf_counter.c +++ /dev/null @@ -1,5000 +0,0 @@ -/* - * Performance counter core code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Each CPU has a list of per CPU counters: - */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_counters __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; - -/* - * perf counter paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_counter_paranoid __read_mostly = 1; - -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_counter_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_counter_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_counter_paranoid > 1; -} - -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ - -/* - * max perf counter sample rate - */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; - -static atomic64_t perf_counter_id; - -/* - * Lock for (sysadmin-configurable) counter reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} - -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - return 0; -} - -void __weak perf_counter_print_debug(void) { } - -static DEFINE_PER_CPU(int, perf_disable_count); - -void __perf_disable(void) -{ - __get_cpu_var(perf_disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(perf_disable_count); -} - -void perf_disable(void) -{ - __perf_disable(); - hw_perf_disable(); -} - -void perf_enable(void) -{ - if (__perf_enable()) - hw_perf_enable(); -} - -static void get_ctx(struct perf_counter_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_counter_context *ctx; - - ctx = container_of(head, struct perf_counter_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_counter_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_counter_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -/* - * If we inherit counters we want to return the parent counter id - * to userspace. - */ -static u64 primary_counter_id(struct perf_counter *counter) -{ - u64 id = counter->id; - - if (counter->parent) - id = counter->parent->id; - - return id; -} - -/* - * Get the perf_counter_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_counter_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) -{ - struct perf_counter_context *ctx; - - rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_counter_ctxp); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) -{ - struct perf_counter_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_counter_context *ctx) -{ - unsigned long flags; - - spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); -} - -/* - * Add a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *group_leader = counter->group_leader; - - /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group - * leader's sibling list: - */ - if (group_leader == counter) - list_add_tail(&counter->group_entry, &ctx->group_list); - else { - list_add_tail(&counter->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - } - - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Remove a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *sibling, *tmp; - - if (list_empty(&counter->group_entry)) - return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) - ctx->nr_stat--; - - list_del_init(&counter->group_entry); - list_del_rcu(&counter->event_entry); - - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; - - /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them - * to the context list directly: - */ - list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, group_entry) { - - list_move_tail(&sibling->group_entry, &ctx->group_list); - sibling->group_leader = sibling; - } -} - -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; - } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) - counter_sched_out(counter, cpuctx, ctx); - - if (group_counter->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance counter - * - * We disable the counter on the hardware level first. After that we - * remove it from the context list. - */ -static void __perf_counter_remove_from_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. - */ - perf_disable(); - - counter_sched_out(counter, cpuctx, ctx); - - list_del_counter(counter, ctx); - - if (!ctx->task) { - /* - * Allow more per task counters with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); - } - - perf_enable(); - spin_unlock(&ctx->lock); -} - - -/* - * Remove the counter from a task's (or a CPU's) list of counters. - * - * Must be called with ctx->mutex held. - * - * CPU counters are removed with a smp call. For task counters we only - * call when the task is on a CPU. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_counter_remove_from_context(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are removed via an smp call and - * the removal is always sucessful. - */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * If the context is active we need to retry the smp call. - */ - if (ctx->nr_active && !list_empty(&counter->group_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not - * succeed. - */ - if (!list_empty(&counter->group_entry)) { - list_del_counter(counter, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_counter_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a counter. - */ -static void update_counter_times(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - u64 run_end; - - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) - return; - - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; - - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time; - - counter->total_time_running = run_end - counter->tstamp_running; -} - -/* - * Update total_time_enabled and total_time_running for all counters in a group. - */ -static void update_group_times(struct perf_counter *leader) -{ - struct perf_counter *counter; - - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, group_entry) - update_counter_times(counter); -} - -/* - * Cross CPU call to disable a performance counter - */ -static void __perf_counter_disable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - - /* - * If the counter is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); - else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock(&ctx->lock); -} - -/* - * Disable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. - */ -static void perf_counter_disable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); - return; - } - - retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); - - spin_lock_irq(&ctx->lock); - /* - * If the counter is still active, we need to retry the cross-call. - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock_irq(&ctx->lock); -} - -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - if (counter->state <= PERF_COUNTER_STATE_OFF) - return 0; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; - return -EAGAIN; - } - - counter->tstamp_running += ctx->time - counter->tstamp_stopped; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (counter->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - - list_for_each_entry(counter, &leader->sibling_list, group_entry) - if (!is_software_counter(counter)) - return 0; - - return 1; -} - -/* - * Work out whether we can put this counter group on the CPU now. - */ -static int group_can_go_on(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software counters can always go on. - */ - if (is_software_only_group(counter)) - return 1; - /* - * If an exclusive group is already on, no other hardware - * counters can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * counters on the CPU, it can't go on. - */ - if (counter->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; -} - -/* - * Cross CPU call to install and enable a performance counter - * - * Must be called with ctx->mutex held - */ -static void __perf_install_in_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int cpu = smp_processor_id(); - int err; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); - - add_counter_to_ctx(counter, ctx); - - /* - * Don't put the counter on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. - */ - if (!group_can_go_on(counter, cpuctx, 1)) - err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, cpu); - - if (err) { - /* - * This counter couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - - spin_unlock(&ctx->lock); -} - -/* - * Attach a performance counter to a context - * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. - * - * If the counter is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. - */ -static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, - int cpu) -{ - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are installed via an smp call and - * the install is always sucessful. - */ - smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_install_in_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * we need to retry the smp call. - */ - if (ctx->is_active && list_empty(&counter->group_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not - * succeed. - */ - if (list_empty(&counter->group_entry)) - add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); -} - -/* - * Put a counter into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - struct perf_counter *sub; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, group_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; -} - -/* - * Cross CPU call to enable a performance counter - */ -static void __perf_counter_enable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int err; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto unlock; - __perf_counter_mark_enabled(counter, ctx); - - /* - * If the counter is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(counter, cpuctx, 1)) { - err = -EEXIST; - } else { - perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - perf_enable(); - } - - if (err) { - /* - * If this counter can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - unlock: - spin_unlock(&ctx->lock); -} - -/* - * Enable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. - */ -static void perf_counter_enable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); - return; - } - - spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto out; - - /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; - - retry: - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); - - spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the counter is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) - goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); - - out: - spin_unlock_irq(&ctx->lock); -} - -static int perf_counter_refresh(struct perf_counter *counter, int refresh) -{ - /* - * not supported on inherited counters - */ - if (counter->attr.inherit) - return -EINVAL; - - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); - - return 0; -} - -void __perf_counter_sched_out(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_counter *counter; - - spin_lock(&ctx->lock); - ctx->is_active = 0; - if (likely(!ctx->nr_counters)) - goto out; - update_context_time(ctx); - - perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters - * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_counter_read(void *counter); - -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) -{ - u64 value; - - if (!counter->attr.inherit_stat) - return; - - /* - * Update the counter value, we cannot use perf_counter_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we - * don't need to use it. - */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); - break; - - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the counter - * values when we flip the contexts. - */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); - - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) -{ - struct perf_counter *counter, *next_counter; - - if (!ctx->nr_stat) - return; - - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); - - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); - - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { - - __perf_counter_sync_stat(counter, next_counter); - - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); - } -} - -/* - * Called from scheduler to remove the counters of the current task, - * with interrupts disabled. - * - * We stop each counter and update the counter value in counter->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. - */ -void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; - struct pt_regs *regs; - int do_switch = 1; - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - update_context_time(ctx); - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp - */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_counter_sync_stat(ctx, next_ctx); - } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; - } -} - -/* - * Called with IRQs disabled - */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) -{ - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); -} - -static void -__perf_counter_sched_in(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter *counter; - int can_add_hw = 1; - - spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_counters)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) - continue; - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; - } - } - - list_for_each_entry(counter, &ctx->group_list, group_entry) { - /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. - */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) - continue; - - /* - * Listen to the 'cpu' scheduling filter constraint - * of counters: - */ - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_counter_sched_in(ctx, cpuctx, cpu); - cpuctx->task_ctx = ctx; -} - -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter_context *ctx = &cpuctx->ctx; - - __perf_counter_sched_in(ctx, cpuctx, cpu); -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_counter *counter, int enable); - -static void perf_adjust_period(struct perf_counter *counter, u64 events) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period, sample_period; - s64 delta; - - events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; -} - -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - struct hw_perf_counter *hwc; - u64 interrupts, freq; - - spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - continue; - - hwc = &counter->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle counters on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; - } - - if (!counter->attr.freq || !counter->attr.sample_freq) - continue; - - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(counter, freq * interrupts); - - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - counter->pmu->disable(counter); - atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); - perf_enable(); - } - } - spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's counters: - */ -static void rotate_ctx(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (!ctx->nr_counters) - return; - - spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group counters too): - */ - perf_disable(); - list_for_each_entry(counter, &ctx->group_list, group_entry) { - list_move_tail(&counter->group_entry, &ctx->group_list); - break; - } - perf_enable(); - - spin_unlock(&ctx->lock); -} - -void perf_counter_task_tick(struct task_struct *curr, int cpu) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - - if (!atomic_read(&nr_counters)) - return; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; - - perf_ctx_adjust_freq(&cpuctx->ctx); - if (ctx) - perf_ctx_adjust_freq(ctx); - - perf_counter_cpu_sched_out(cpuctx); - if (ctx) - __perf_counter_task_sched_out(ctx); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_counter_cpu_sched_in(cpuctx, cpu); - if (ctx) - perf_counter_task_sched_in(curr, cpu); -} - -/* - * Enable all of a task's counters that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_counter_enable_on_exec(struct task_struct *task) -{ - struct perf_counter_context *ctx; - struct perf_counter *counter; - unsigned long flags; - int enabled = 0; - - local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) - goto out; - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (!counter->attr.enable_on_exec) - continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - continue; - __perf_counter_mark_enabled(counter, ctx); - enabled = 1; - } - - /* - * Unclone this context if we enabled any counter. - */ - if (enabled) - unclone_ctx(ctx); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(task, smp_processor_id()); - out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware counter - */ -static void __perf_counter_read(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); - local_irq_restore(flags); -} - -static u64 perf_counter_read(struct perf_counter *counter) -{ - /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - } - - return atomic64_read(&counter->count); -} - -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); - ctx->task = task; -} - -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) -{ - struct perf_counter_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - /* - * If cpu is not a wildcard then this is a percpu counter: - */ - if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu > num_possible_cpus()) - return ERR_PTR(-EINVAL); - - /* - * We could be clever and allow to attach a counter to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_isset(cpu, cpu_online_map)) - return ERR_PTR(-ENODEV); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - - return ctx; - } - - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach counters to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - retry: - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - err = -ENOMEM; - if (!ctx) - goto errout; - __perf_counter_init_context(ctx, task); - get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ - kfree(ctx); - goto retry; - } - get_task_struct(task); - } - - put_task_struct(task); - return ctx; - - errout: - put_task_struct(task); - return ERR_PTR(err); -} - -static void free_counter_rcu(struct rcu_head *head) -{ - struct perf_counter *counter; - - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); -} - -static void perf_pending_sync(struct perf_counter *counter); - -static void free_counter(struct perf_counter *counter) -{ - perf_pending_sync(counter); - - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); - } - - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; - } - - if (counter->destroy) - counter->destroy(counter); - - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); -} - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; - - file->private_data = NULL; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&ctx->mutex); - - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); - - free_counter(counter); - - return 0; -} - -static int perf_counter_read_size(struct perf_counter *counter) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - -static u64 perf_counter_read_value(struct perf_counter *counter) -{ - struct perf_counter *child; - u64 total = 0; - - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); - - return total; -} - -static int perf_counter_read_entry(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} - -static int perf_counter_read_group(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - struct perf_counter *leader = counter->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_counter_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; - - size += err; - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_counter_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; - - size += err; - } - - return size; -} - -static int perf_counter_read_one(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - u64 values[4]; - int n = 0; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) -{ - u64 read_format = counter->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; - - if (count < perf_counter_read_size(counter)) - return -ENOSPC; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); - else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_counter *counter = file->private_data; - - return perf_read_hw(counter, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_counter *counter = file->private_data; - struct perf_mmap_data *data; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - events = atomic_xchg(&data->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &counter->waitq, wait); - - return events; -} - -static void perf_counter_reset(struct perf_counter *counter) -{ - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); -} - -/* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. - */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter *child; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) - func(child); - mutex_unlock(&counter->child_mutex); -} - -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, group_entry) - perf_counter_for_each_child(counter, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -{ - struct perf_counter_context *ctx = counter->ctx; - unsigned long size; - int ret = 0; - u64 value; - - if (!counter->attr.sample_period) - return -EINVAL; - - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) - return -EFAULT; - - if (!value) - return -EINVAL; - - spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - counter->attr.sample_freq = value; - } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; - } -unlock: - spin_unlock_irq(&ctx->lock); - - return ret; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); - u32 flags = arg; - - switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; - break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; - break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; - break; - - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); - - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); - - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); - else - perf_counter_for_each_child(counter, func); - - return 0; -} - -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 -#endif - -static int perf_counter_index(struct perf_counter *counter) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_counter_mmap_page *userpg; - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - userpg = data->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); - - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; - - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; - - vmf->page = virt_to_page(data->data_pages[nr]); - } - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) -{ - struct perf_mmap_data *data; - unsigned long size; - int i; - - WARN_ON(atomic_read(&counter->mmap_count)); - - size = sizeof(struct perf_mmap_data); - size += nr_pages * sizeof(void *); - - data = kzalloc(size, GFP_KERNEL); - if (!data) - goto fail; - - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) - goto fail_data_pages; - } - - data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - if (counter->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - counter->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - - rcu_assign_pointer(counter->data, data); - - return 0; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); - - free_page((unsigned long)data->user_page); - -fail_user_page: - kfree(data); - -fail: - return -ENOMEM; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void __perf_mmap_data_free(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - int i; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - - kfree(data); -} - -static void perf_mmap_data_free(struct perf_counter *counter) -{ - struct perf_mmap_data *data = counter->data; - - WARN_ON(atomic_read(&counter->mmap_count)); - - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - atomic_inc(&counter->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - struct user_struct *user = current_user(); - - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); - } -} - -static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_counter *counter = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have data pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) - goto unlock; - - atomic_set(&counter->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; - -unlock: - mutex_unlock(&counter->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf counter wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_counter_wakeup(struct perf_counter *counter) -{ - wake_up_all(&counter->waitq); - - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; - } -} - -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_counter(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); - } - - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); - } -} - -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_counter_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_counter *counter) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return counter->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_counter *counter) -{ - wait_event(counter->waitq, perf_not_pending(counter)); -} - -void perf_counter_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -/* - * Output - */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!data->writable) - return true; - - mask = (data->nr_pages << PAGE_SHIFT) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->data->poll, POLL_IN); - - if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); - } else - perf_counter_wakeup(handle->counter); -} - -/* - * Curious locking construct. - * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_lock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - int cpu; - - handle->locked = 0; - - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; - - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - handle->locked = 1; -} - -static void perf_output_unlock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; - -again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; - - /* - * NMI can happen here, which means we can miss a done_head update. - */ - - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); - - /* - * Therefore we have to validate we did not indeed do so. - */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - goto again; - } - - if (atomic_xchg(&data->wakeup, 0)) - perf_output_wakeup(handle); -out: - local_irq_restore(handle->flags); -} - -void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) -{ - struct perf_counter *output_counter; - struct perf_mmap_data *data; - unsigned long tail, offset, head; - int have_lost; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited counters we send all the output towards the parent. - */ - if (counter->parent) - counter = counter->parent; - - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; - - data = rcu_dereference(counter->data); - if (!data) - goto out; - - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->sample = sample; - - if (!data->nr_pages) - goto fail; - - have_lost = atomic_read(&data->lost); - if (have_lost) - size += sizeof(lost_event); - - perf_output_lock(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); - offset = head = atomic_long_read(&data->head); - head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) - goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); - - handle->offset = offset; - handle->head = head; - - if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); - - if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; - lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; - lost_event.lost = atomic_xchg(&data->lost, 0); - - perf_output_put(handle, lost_event); - } - - return 0; - -fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; - - int wakeup_events = counter->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } - } - - perf_output_unlock(handle); - rcu_read_unlock(); -} - -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_tgid_nr_ns(p, counter->ns); -} - -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_pid_nr_ns(p, counter->ns); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - u64 read_format = counter->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = atomic64_read(&counter->count); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; - - if (leader != counter) - leader->pmu->read(leader); - - values[n++] = atomic64_read(&leader->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - if (sub != counter) - sub->pmu->read(sub); - - values[n++] = atomic64_read(&sub->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); - else - perf_output_read_one(handle, counter); -} - -void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter) -{ - u64 sample_type = data->type; - - perf_output_put(handle, *header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(handle, data->ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); - - perf_output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(handle, raw); - } - } -} - -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter, - struct pt_regs *regs) -{ - u64 sample_type = counter->attr.sample_type; - - data->type = sample_type; - - header->type = PERF_EVENT_SAMPLE; - header->size = sizeof(*header); - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - if (sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); - - header->size += sizeof(data->ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_counter_pid(counter, current); - data->tid_entry.tid = perf_counter_tid(counter, current); - - header->size += sizeof(data->tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - data->time = perf_clock(); - - header->size += sizeof(data->time); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header->size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_counter_id(counter); - - header->size += sizeof(data->id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = counter->id; - - header->size += sizeof(data->stream_id); - } - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - - header->size += sizeof(data->cpu_entry); - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header->size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - header->size += perf_counter_read_size(counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - data->callchain = perf_callchain(regs); - - if (data->callchain) - size += data->callchain->nr; - - header->size += size * sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; - } -} - -static void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_output_handle handle; - struct perf_event_header header; - - perf_prepare_sample(&header, data, counter, regs); - - if (perf_output_begin(&handle, counter, header.size, nmi, 1)) - return; - - perf_output_sample(&handle, &header, data, counter); - - perf_output_end(&handle); -} - -/* - * read event - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_counter_read_event(struct perf_counter *counter, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_read_event read_event = { - .header = { - .type = PERF_EVENT_READ, - .misc = 0, - .size = sizeof(read_event) + perf_counter_read_size(counter), - }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), - }; - int ret; - - ret = perf_output_begin(&handle, counter, read_event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, read_event); - perf_output_read(&handle, counter); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_counter_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - u64 time; - } event; -}; - -static void perf_counter_task_output(struct perf_counter *counter, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - int size; - struct task_struct *task = task_event->task; - int ret; - - size = task_event->event.header.size; - ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); - - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); - - task_event->event.time = perf_clock(); - - perf_output_put(&handle, task_event->event); - - perf_output_end(&handle); -} - -static int perf_counter_task_match(struct perf_counter *counter) -{ - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) - return 1; - - return 0; -} - -static void perf_counter_task_ctx(struct perf_counter_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); - } - rcu_read_unlock(); -} - -static void perf_counter_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); - if (ctx) - perf_counter_task_ctx(ctx, task_event); - rcu_read_unlock(); -} - -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event = { - .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, - .misc = 0, - .size = sizeof(task_event.event), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - }, - }; - - perf_counter_task_event(&task_event); -} - -void perf_counter_fork(struct task_struct *task) -{ - perf_counter_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event; -}; - -static void perf_counter_comm_output(struct perf_counter *counter, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); - - perf_output_put(&handle, comm_event->event); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - perf_output_end(&handle); -} - -static int perf_counter_comm_match(struct perf_counter *counter) -{ - if (counter->attr.comm) - return 1; - - return 0; -} - -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); - } - rcu_read_unlock(); -} - -static void perf_counter_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - unsigned int size; - char comm[TASK_COMM_LEN]; - - memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event.header.size = sizeof(comm_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_comm_ctx(ctx, comm_event); - rcu_read_unlock(); -} - -void perf_counter_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); - - if (!atomic_read(&nr_comm_counters)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event = { - .header = { - .type = PERF_EVENT_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_counter_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event; -}; - -static void perf_counter_mmap_output(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); - - perf_output_put(&handle, mmap_event->event); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - perf_output_end(&handle); -} - -static int perf_counter_mmap_match(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - if (counter->attr.mmap) - return 1; - - return 0; -} - -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, - struct perf_mmap_event *mmap_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); - } - rcu_read_unlock(); -} - -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event.header.size = sizeof(mmap_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); - rcu_read_unlock(); - - kfree(buf); -} - -void __perf_counter_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event = { - .header = { - .type = PERF_EVENT_MMAP, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_counter *counter, int enable) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_EVENT_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = perf_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, - }; - - if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; - - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_output_end(&handle); -} - -/* - * Generic counter overflow handling, sampling. - */ - -static int __perf_counter_overflow(struct perf_counter *counter, int nmi, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) -{ - int events = atomic_read(&counter->event_limit); - struct hw_perf_counter *hwc = &counter->hw; - int ret = 0; - - throttle = (throttle && counter->pmu->unthrottle != NULL); - - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling counters even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: - */ - ret = 1; - } - } - - if (counter->attr.freq) { - u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; - - hwc->freq_stamp = now; - - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * counters - */ - - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { - ret = 1; - counter->pending_kill = POLL_HUP; - if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); - } else - perf_counter_disable(counter); - } - - perf_counter_output(counter, nmi, data, regs); - return ret; -} - -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - return __perf_counter_overflow(counter, nmi, 1, data, regs); -} - -/* - * Generic software counter infrastructure - */ - -/* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = atomic64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_counter *hwc = &counter->hw; - int throttle = 0; - u64 overflow; - - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } -} - -static void perf_swcounter_unthrottle(struct perf_counter *counter) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_counter *hwc = &counter->hw; - - atomic64_add(nr, &counter->count); - - if (!hwc->sample_period) - return; - - if (!regs) - return; - - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data, regs); -} - -static int perf_swcounter_is_counting(struct perf_counter *counter) -{ - /* - * The counter is active, we're good! - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - return 1; - - /* - * The counter is off/error, not counting. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) - return 0; - - /* - * The counter is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (counter->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_swcounter_match(struct perf_counter *counter, - enum perf_type_id type, - u32 event_id, struct pt_regs *regs) -{ - if (!perf_swcounter_is_counting(counter)) - return 0; - - if (counter->attr.type != type) - return 0; - if (counter->attr.config != event_id) - return 0; - - if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) - return 0; - - if (counter->attr.exclude_kernel && !user_mode(regs)) - return 0; - } - - return 1; -} - -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event_id, regs)) - perf_swcounter_add(counter, nr, nmi, data, regs); - } - rcu_read_unlock(); -} - -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) -{ - if (in_nmi()) - return &cpuctx->recursion[3]; - - if (in_irq()) - return &cpuctx->recursion[2]; - - if (in_softirq()) - return &cpuctx->recursion[1]; - - return &cpuctx->recursion[0]; -} - -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; - - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data, regs); - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); - rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); -} - -void __perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data = { - .addr = addr, - }; - - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, - &data, regs); -} - -static void perf_swcounter_read(struct perf_counter *counter) -{ -} - -static int perf_swcounter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - - if (hwc->sample_period) { - hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); - } - return 0; -} - -static void perf_swcounter_disable(struct perf_counter *counter) -{ -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, -}; - -/* - * hrtimer based swcounter callback - */ - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((counter->attr.exclude_kernel || !regs) && - !counter->attr.exclude_user) - regs = task_pt_regs(current); - - if (regs) { - if (perf_counter_overflow(counter, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -/* - * Software counter: cpu wall time clock - */ - -static void cpu_clock_perf_counter_update(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); -} - -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); -} - -static void cpu_clock_perf_counter_read(struct perf_counter *counter) -{ - cpu_clock_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, -}; - -/* - * Software counter: task time clock - */ - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&counter->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &counter->count); -} - -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 now; - - now = counter->ctx->time; - - atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); - -} - -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; - } - - task_clock_perf_counter_update(counter, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -#ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) -{ - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); - - if (!regs) - regs = task_pt_regs(current); - - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tpcounter_event); - -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); - -static void tp_perf_counter_destroy(struct perf_counter *counter) -{ - ftrace_profile_disable(counter->attr.config); -} - -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - if (ftrace_profile_enable(counter->attr.config)) - return NULL; - - counter->destroy = tp_perf_counter_destroy; - - return &perf_ops_generic; -} -#else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} -#endif - -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_counter_destroy(struct perf_counter *counter) -{ - u64 event_id = counter->attr.config; - - WARN_ON(counter->parent); - - atomic_dec(&perf_swcounter_enabled[event_id]); -} - -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) -{ - const struct pmu *pmu = NULL; - u64 event_id = counter->attr.config; - - /* - * Software counters (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. - */ - if (counter->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event_id]); - counter->destroy = sw_perf_counter_destroy; - } - pmu = &perf_ops_generic; - break; - } - - return pmu; -} - -/* - * Allocate and initialize a counter structure - */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, - int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, - gfp_t gfpflags) -{ - const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - long err; - - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) - return ERR_PTR(-ENOMEM); - - /* - * Single counters are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = counter; - - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); - - INIT_LIST_HEAD(&counter->group_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); - - mutex_init(&counter->mmap_mutex); - - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; - - counter->parent = parent_counter; - - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - - counter->state = PERF_COUNTER_STATE_INACTIVE; - - if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; - - pmu = NULL; - - hwc = &counter->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - atomic64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); - break; - - default: - break; - } -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); - return ERR_PTR(err); - } - - counter->pmu = pmu; - - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); - } - - return counter; -} - -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd) -{ - struct perf_counter *output_counter = NULL; - struct file *output_file = NULL; - struct perf_counter *old_output; - int fput_needed = 0; - int ret = -EINVAL; - - if (!output_fd) - goto set; - - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_counter = output_file->private_data; - - /* Don't chain output fds */ - if (output_counter->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (counter->data) - goto out; - - atomic_long_inc(&output_file->f_count); - -set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); - - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this counter. - */ - synchronize_rcu(); - fput(old_output->filp); - } - - ret = 0; -out: - fput_light(output_file, fput_needed); - return ret; -} - -/** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu - * - * @attr_uptr: event type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader counter fd - */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; - struct file *group_file = NULL; - int fput_needed = 0; - int fput_needed2 = 0; - int err; - - /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) - return -EINVAL; - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - /* - * Look up the group leader (we will attach this counter to it): - */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { - err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - - group_leader = group_file->private_data; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_put_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (group_leader->ctx != ctx) - goto err_put_context; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) - goto err_put_context; - - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); - if (err < 0) - goto err_free_put_context; - - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) - goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); - if (err) - goto err_fput_free_put_context; - } - - counter->filp = counter_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); - ++ctx->generation; - mutex_unlock(&ctx->mutex); - - counter->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); - -err_fput_free_put_context: - fput_light(counter_file, fput_needed2); - -err_free_put_context: - if (err < 0) - kfree(counter); - -err_put_context: - if (err < 0) - put_ctx(ctx); - - fput_light(group_file, fput_needed); - - return err; -} - -/* - * inherit a counter from parent task to child task: - */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *child_counter; - - /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; - - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, - GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent counter, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; - - /* - * Link it up in the child's context: - */ - add_counter_to_ctx(child_counter, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_counter->filp->f_count); - - /* - * Link this into the parent counter's child list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - return child_counter; -} - -static int inherit_group(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; - - leader = inherit_counter(parent_counter, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, group_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static void sync_child_counter(struct perf_counter *child_counter, - struct task_struct *child) -{ - struct perf_counter *parent_counter = child_counter->parent; - u64 child_val; - - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); - - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); - - /* - * Remove this counter from the parent's list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - /* - * Release the parent counter, if this was the last - * reference to it. - */ - fput(parent_counter->filp); -} - -static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, - struct task_struct *child) -{ - struct perf_counter *parent_counter; - - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); - - parent_counter = child_counter->parent; - /* - * It can happen that parent exits first, and has counters - * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. - */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); - } -} - -/* - * When a child task exits, feed back counter values to parent counters. - */ -void perf_counter_exit_task(struct task_struct *child) -{ - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. - */ - perf_counter_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); - -again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->group_list, - group_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); - - /* - * If the last counter was a group counter, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->group_list)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. - */ -void perf_counter_free_task(struct task_struct *task) -{ - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; - - if (!ctx) - return; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) { - struct perf_counter *parent = counter->parent; - - if (WARN_ON_ONCE(!parent)) - continue; - - mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - list_del_counter(counter, ctx); - free_counter(counter); - } - - if (!list_empty(&ctx->group_list)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - -/* - * Initialize the perf_counter context in task_struct - */ -int perf_counter_init_task(struct task_struct *child) -{ - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; - struct task_struct *parent = current; - int inherited_all = 1; - int ret = 0; - - child->perf_counter_ctxp = NULL; - - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); - - if (likely(!parent->perf_counter_ctxp)) - return 0; - - /* - * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; - get_task_struct(child); - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; - - if (!counter->attr.inherit) { - inherited_all = 0; - continue; - } - - ret = inherit_group(counter, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; - break; - } - } - - if (inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of counters and the generation - * count can't have changed since we took the mutex. - */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - - return ret; -} - -static void __cpuinit perf_counter_init_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - hw_perf_counter_setup(cpu); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; - - list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) - __perf_counter_remove_from_context(counter); -} -static void perf_counter_exit_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); -} -#else -static inline void perf_counter_exit_cpu(int cpu) { } -#endif - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - -void __init perf_counter_init(void) -{ - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_counters) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_counters", -}; - -static int __init perf_counter_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_counter_sysfs_init); diff --git a/kernel/perf_event.c b/kernel/perf_event.c new file mode 100644 index 000000000000..6e8b99a04e1e --- /dev/null +++ b/kernel/perf_event.c @@ -0,0 +1,5000 @@ +/* + * Performance event core code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Each CPU has a list of per CPU events: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_events __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf event sample rate + */ +int sysctl_perf_event_sample_rate __read_mostly = 100000; + +static atomic64_t perf_event_id; + +/* + * Lock for (sysadmin-configurable) event reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_event_print_debug(void) { } + +static DEFINE_PER_CPU(int, perf_disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(perf_disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(perf_disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_event_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *group_leader = event->group_leader; + + /* + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group + * leader's sibling list: + */ + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); + else { + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; + + if (list_empty(&event->group_entry)) + return; + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); + + if (event->group_leader != event) + event->group_leader->nr_siblings--; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + + list_move_tail(&sibling->group_entry, &ctx->group_list); + sibling->group_leader = sibling; + } +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_event_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * events on a global level. + */ + perf_disable(); + + event_sched_out(event, cpuctx, ctx); + + list_del_event(event, ctx); + + if (!ctx->task) { + /* + * Allow more per task events with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * Must be called with ctx->mutex held. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_event_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the event safely, if the call above did not + * succeed. + */ + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + event->total_time_enabled = ctx->time - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +/* + * Cross CPU call to disable a performance event + */ +static void __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +static void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_event_disable, event); + + spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + struct perf_event *event, *partial_group; + int ret; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (event_sched_in(group_event, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + break; + event_sched_out(event, cpuctx, ctx); + } + event_sched_out(group_event, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. + */ +static int is_software_only_group(struct perf_event *leader) +{ + struct perf_event *event; + + if (!is_software_event(leader)) + return 0; + + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (is_software_only_group(event)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no events. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * events on a global level. NOP for non NMI based events. + */ + perf_disable(); + + add_event_to_ctx(event, ctx); + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx, cpu); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the event safely, if it the call above did not + * succeed. + */ + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* + * Cross CPU call to enable a performance event + */ +static void __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int err; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + __perf_event_mark_enabled(event, ctx); + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, + smp_processor_id()); + else + err = event_sched_in(event, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +static void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_event_enable, event); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); + + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) { + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event != event->group_leader) + event_sched_out(event, cpuctx, ctx); + else + group_sched_out(event, cpuctx, ctx); + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_read(void *event); + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); + break; + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_event_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_event_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) + continue; + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) + event_sched_in(event, cpuctx, ctx, cpu); + else { + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } + + list_for_each_entry(event, &ctx->group_list, group_entry) { + /* + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. + */ + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) { + if (event_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } else { + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_event_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + __perf_event_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static void perf_adjust_period(struct perf_event *event, u64 events) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, event->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(event, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (!ctx->nr_events) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group events too): + */ + perf_disable(); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_event_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + + if (!atomic_read(&nr_events)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_event_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_event_cpu_sched_out(cpuctx); + if (ctx) + __perf_event_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_event_task_sched_in(curr, cpu); +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) + goto out; + + __perf_event_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) + continue; + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + continue; + __perf_event_mark_enabled(event, ctx); + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_event_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + event->pmu->read(event); + update_event_times(event); + local_irq_restore(flags); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); + } + + return atomic64_read(&event->count); +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void +__perf_event_init_context(struct perf_event_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_event_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu event: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_event_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); +} + +static void perf_pending_sync(struct perf_event *event); + +static void free_event(struct perf_event *event) +{ + perf_pending_sync(event); + + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + } + + if (event->output) { + fput(event->output->filp); + event->output = NULL; + } + + if (event->destroy) + event->destroy(event); + + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_event_remove_from_context(event); + mutex_unlock(&ctx->mutex); + + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); + + free_event(event); + + return 0; +} + +static int perf_event_read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_event_read_value(struct perf_event *event) +{ + struct perf_event *child; + u64 total = 0; + + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); + + return total; +} + +static int perf_event_read_entry(struct perf_event *event, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_event_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < perf_event_read_size(event)) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!event->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +int perf_event_set_output(struct perf_event *event, int output_fd); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, PAGE_SIZE * nr_pages, + event->attr.wakeup_watermark); + } + if (!data->watermark) + data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); + + rcu_assign_pointer(event->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + int i; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + perf_mmap_free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + perf_mmap_free_page((unsigned long)data->data_pages[i]); + + kfree(data); +} + +static void perf_mmap_data_free(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_free(event); + mutex_unlock(&event->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { + ret = -EINVAL; + goto unlock; + } + + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->data); + ret = perf_mmap_data_alloc(event, nr_pages); + if (ret) + goto unlock; + + atomic_set(&event->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + event->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + event->data->writable = 1; + +unlock: + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_event(struct perf_pending_entry *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_event_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_event *event) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return event->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_event *event) +{ + wait_event(event->waitq, perf_not_pending(event)); +} + +void perf_event_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); + } else + perf_event_wakeup(handle->event); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event_id completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_event *output_event; + struct perf_mmap_data *data; + unsigned long tail, offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; + + data = rcu_dereference(event->data); + if (!data) + goto out; + + handle->data = data; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!data->nr_pages) + goto fail; + + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + + perf_output_lock(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + offset = head = atomic_long_read(&data->head); + head += size; + if (unlikely(!perf_output_space(data, tail, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if (head - tail > data->watermark) + atomic_set(&data->wakeup, 1); + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = event->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + + return 0; + +fail: + atomic_inc(&data->lost); + perf_output_unlock(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&event->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); + else + perf_output_read_one(handle, event); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + if (sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + + header->size += sizeof(data->tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + data->time = perf_clock(); + + header->size += sizeof(data->time); + } + + if (sample_type & PERF_SAMPLE_ADDR) + header->size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_event_id(event); + + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = event->id; + + header->size += sizeof(data->stream_id); + } + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + + header->size += sizeof(data->cpu_entry); + } + + if (sample_type & PERF_SAMPLE_PERIOD) + header->size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + header->size += perf_event_read_size(event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + return; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + perf_event_read_size(event), + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size; + struct task_struct *task = task_event->task; + int ret; + + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + task_event->event_id.time = perf_clock(); + + perf_output_put(&handle, task_event->event_id); + + perf_output_end(&handle); +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->attr.comm || event->attr.mmap || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } + rcu_read_unlock(); +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_event_ctxp); + if (ctx) + perf_event_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } + rcu_read_unlock(); +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + unsigned int size; + char comm[TASK_COMM_LEN]; + + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + if (event->attr.mmap) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + throttle = (throttle && event->pmu->unthrottle != NULL); + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_event_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling events even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the event got enabled again: + */ + ret = 1; + } + } + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); + } else + perf_event_disable(event); + } + + perf_event_output(event, nmi, data, regs); + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + u64 overflow; + + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_unthrottle(struct perf_event *event) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swevent_add(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + atomic64_add(nr, &event->count); + + if (!hwc->sample_period) + return; + + if (!regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swevent_overflow(event, nmi, data, regs); +} + +static int perf_swevent_is_counting(struct perf_event *event) +{ + /* + * The event is active, we're good! + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) + return 1; + + /* + * The event is off/error, not counting. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE) + return 0; + + /* + * The event is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (event->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, struct pt_regs *regs) +{ + if (!perf_swevent_is_counting(event)) + return 0; + + if (event->attr.type != type) + return 0; + if (event->attr.config != event_id) + return 0; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 0; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + return 1; +} + +static void perf_swevent_ctx_event(struct perf_event_context *ctx, + enum perf_type_id type, + u32 event_id, u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); + } + rcu_read_unlock(); +} + +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, + nr, nmi, data, regs); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .addr = addr, + }; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, + &data, regs); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + return 0; +} + +static void perf_swevent_disable(struct perf_event *event) +{ +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, +}; + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + data.addr = 0; + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_perf_event_update(struct perf_event *event) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); +} + +static int cpu_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); +} + +static void cpu_clock_perf_event_read(struct perf_event *event) +{ + cpu_clock_perf_event_update(event); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_perf_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&event->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &event->count); +} + +static int task_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; + + now = event->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); + +} + +static void task_clock_perf_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_perf_event_update(event, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, +}; + +#ifdef CONFIG_EVENT_PROFILE +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) +{ + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .addr = addr, + .raw = &raw, + }; + + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + ftrace_profile_disable(event->attr.config); +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_paranoid_tracepoint_raw() && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (ftrace_profile_enable(event->attr.config)) + return NULL; + + event->destroy = tp_perf_event_destroy; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + return NULL; +} +#endif + +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); +} + +static const struct pmu *sw_perf_event_init(struct perf_event *event) +{ + const struct pmu *pmu = NULL; + u64 event_id = event->attr.config; + + /* + * Software events (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. + */ + if (event->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, + int cpu, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + event = kzalloc(sizeof(*event), gfpflags); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_event_init(event); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_event_init(event); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_event_init(event); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +int perf_event_set_output(struct perf_event *event, int output_fd) +{ + struct perf_event *output_event = NULL; + struct file *output_file = NULL; + struct perf_event *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_event = output_file->private_data; + + /* Don't chain output fds */ + if (output_event->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (event->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this event. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int err; + + /* for future expandability... */ + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this event to it): + */ + group_leader = NULL; + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + err = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + event = perf_event_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + if (err < 0) + goto err_free_put_context; + + event_file = fget_light(err, &fput_needed2); + if (!event_file) + goto err_free_put_context; + + if (flags & PERF_FLAG_FD_OUTPUT) { + err = perf_event_set_output(event, group_fd); + if (err) + goto err_fput_free_put_context; + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + +err_fput_free_put_context: + fput_light(event_file, fput_needed2); + +err_free_put_context: + if (err < 0) + kfree(event); + +err_put_context: + if (err < 0) + put_ctx(ctx); + + fput_light(group_file, fput_needed); + + return err; +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, + GFP_KERNEL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = atomic64_read(&child_event->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + struct perf_event *parent_event; + + update_event_times(child_event); + perf_event_remove_from_context(child_event); + + parent_event = child_event->parent; + /* + * It can happen that parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped - but otherwise linger. + */ + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_event_ctxp = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->group_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); + } + + if (!list_empty(&ctx->group_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_event_ctxp = NULL; + + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + if (likely(!parent->perf_event_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * events that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { + if (event != event->group_leader) + continue; + + if (!event->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of events and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_event_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_event_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); +} +static void perf_event_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_event_init_cpu(cpu); + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_event_setup_online(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_event_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_events) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_events", +}; + +static int __init perf_event_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/sched.c b/kernel/sched.c index faf4d463bbff..291c8d213d13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include @@ -2059,7 +2059,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2724,7 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5199,7 +5199,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5415,7 +5415,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -7692,7 +7692,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -9549,7 +9549,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097c76fa..ea5c3bcac881 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -1511,11 +1511,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6b07b5..515bc230ac2a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -177,4 +177,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a631ba684a4..6ba49c7cb128 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include @@ -964,28 +964,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, diff --git a/kernel/timer.c b/kernel/timer.c index bbb51074680e..811e5c391456 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include @@ -1187,7 +1187,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8712ce3c6a0e..233f3483ac83 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include "trace_output.h" @@ -414,7 +414,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); } while(0); } @@ -476,7 +476,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec.nr = syscall_nr; rec.ret = syscall_get_return_value(current, regs); - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + perf_tp_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); } int reg_prof_syscall_exit(char *name) diff --git a/mm/mmap.c b/mm/mmap.c index 26892e346d8f..376492ed08f4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -1220,7 +1220,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - perf_counter_mmap(vma); + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2308,7 +2308,7 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; - perf_counter_mmap(vma); + perf_event_mmap(vma); return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index d80311baeb2d..8bc969d8112d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_counter_mmap(vma); + perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 0aba8b6e9c54..b5f1953b6144 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -318,7 +318,7 @@ export PERL_PATH LIB_FILE=libperf.a -LIB_H += ../../include/linux/perf_counter.h +LIB_H += ../../include/linux/perf_event.h LIB_H += ../../include/linux/rbtree.h LIB_H += ../../include/linux/list.h LIB_H += util/include/linux/list.h diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 043d85b7e254..1ec741615814 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -505,7 +505,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - if (event->header.misc & PERF_EVENT_MISC_KERNEL) { + if (event->header.misc & PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -513,7 +513,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (event->header.misc & PERF_EVENT_MISC_USER) { + } else if (event->header.misc & PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -565,7 +565,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->mmap.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", + dump_printf("%p [%p]: PERF_RECORD_MMAP %d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, @@ -575,7 +575,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) event->mmap.filename); if (thread == NULL || map == NULL) { - dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n"); return 0; } @@ -591,14 +591,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) struct thread *thread; thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -614,7 +614,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->fork.pid, &threads, &last_match); parent = threads__findnew(event->fork.ppid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_FORK: %d:%d\n", + dump_printf("%p [%p]: PERF_RECORD_FORK: %d:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->fork.pid, event->fork.ppid); @@ -627,7 +627,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; if (!thread || !parent || thread__fork(thread, parent)) { - dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); return -1; } total_fork++; @@ -639,23 +639,23 @@ static int process_event(event_t *event, unsigned long offset, unsigned long head) { switch (event->header.type) { - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return process_mmap_event(event, offset, head); - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_FORK: + case PERF_RECORD_FORK: return process_fork_event(event, offset, head); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2459e5a22ed8..a5a050af8e7d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -77,7 +77,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static unsigned long mmap_read_head(struct mmap_data *md) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; long head; head = pc->data_head; @@ -88,7 +88,7 @@ static unsigned long mmap_read_head(struct mmap_data *md) static void mmap_write_tail(struct mmap_data *md, unsigned long tail) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; /* * ensure all reads are done before we write the tail out. @@ -233,7 +233,7 @@ static pid_t pid_synthesize_comm_event(pid_t pid, int full) } } - comm_ev.header.type = PERF_EVENT_COMM; + comm_ev.header.type = PERF_RECORD_COMM; size = ALIGN(size, sizeof(u64)); comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); @@ -288,7 +288,7 @@ static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid) while (1) { char bf[BUFSIZ], *pbf = bf; struct mmap_event mmap_ev = { - .header = { .type = PERF_EVENT_MMAP }, + .header = { .type = PERF_RECORD_MMAP }, }; int n; size_t size; @@ -355,7 +355,7 @@ static void synthesize_all(void) static int group_fd; -static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr) +static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr) { struct perf_header_attr *h_attr; @@ -371,7 +371,7 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int static void create_counter(int counter, int cpu, pid_t pid) { - struct perf_counter_attr *attr = attrs + counter; + struct perf_event_attr *attr = attrs + counter; struct perf_header_attr *h_attr; int track = !counter; /* only the first counter needs these */ struct { @@ -417,7 +417,7 @@ static void create_counter(int counter, int cpu, pid_t pid) attr->disabled = 1; try_again: - fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); + fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0); if (fd[nr_cpu][counter] < 0) { int err = errno; @@ -444,7 +444,7 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[nr_cpu][counter], strerror(err)); - die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } @@ -478,7 +478,7 @@ try_again: if (multiplex && fd[nr_cpu][counter] != multiplex_fd) { int ret; - ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd); + ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd); assert(ret != -1); } else { event_array[nr_poll].fd = fd[nr_cpu][counter]; @@ -496,7 +496,7 @@ try_again: } } - ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); + ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE); } static void open_counters(int cpu, pid_t pid) @@ -642,7 +642,7 @@ static int __cmd_record(int argc, const char **argv) if (done) { for (i = 0; i < nr_cpu; i++) { for (counter = 0; counter < nr_counters; counter++) - ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE); + ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE); } } } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index cdf9a8d27bb9..19669c20088e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1121,7 +1121,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1158,9 +1158,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (comm_list && !strlist__has_entry(comm_list, thread->comm)) return 0; - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -1168,7 +1168,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -1210,7 +1210,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->mmap.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n", + dump_printf("%p [%p]: PERF_RECORD_MMAP %d/%d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, @@ -1221,7 +1221,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) event->mmap.filename); if (thread == NULL || map == NULL) { - dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n"); return 0; } @@ -1238,14 +1238,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -1262,10 +1262,10 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->fork.pid, &threads, &last_match); parent = threads__findnew(event->fork.ppid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n", + dump_printf("%p [%p]: PERF_RECORD_%s: (%d:%d):(%d:%d)\n", (void *)(offset + head), (void *)(long)(event->header.size), - event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT", + event->header.type == PERF_RECORD_FORK ? "FORK" : "EXIT", event->fork.pid, event->fork.tid, event->fork.ppid, event->fork.ptid); @@ -1276,11 +1276,11 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) if (thread == parent) return 0; - if (event->header.type == PERF_EVENT_EXIT) + if (event->header.type == PERF_RECORD_EXIT) return 0; if (!thread || !parent || thread__fork(thread, parent)) { - dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); return -1; } total_fork++; @@ -1291,7 +1291,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) static int process_lost_event(event_t *event, unsigned long offset, unsigned long head) { - dump_printf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n", + dump_printf("%p [%p]: PERF_RECORD_LOST: id:%Ld: lost:%Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->lost.id, @@ -1305,7 +1305,7 @@ process_lost_event(event_t *event, unsigned long offset, unsigned long head) static int process_read_event(event_t *event, unsigned long offset, unsigned long head) { - struct perf_counter_attr *attr; + struct perf_event_attr *attr; attr = perf_header__find_attr(event->read.id, header); @@ -1319,7 +1319,7 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head) event->read.value); } - dump_printf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n", + dump_printf("%p [%p]: PERF_RECORD_READ: %d %d %s %Lu\n", (void *)(offset + head), (void *)(long)(event->header.size), event->read.pid, @@ -1337,31 +1337,31 @@ process_event(event_t *event, unsigned long offset, unsigned long head) trace_event(event); switch (event->header.type) { - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return process_mmap_event(event, offset, head); - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_FORK: - case PERF_EVENT_EXIT: + case PERF_RECORD_FORK: + case PERF_RECORD_EXIT: return process_task_event(event, offset, head); - case PERF_EVENT_LOST: + case PERF_RECORD_LOST: return process_lost_event(event, offset, head); - case PERF_EVENT_READ: + case PERF_RECORD_READ: return process_read_event(event, offset, head); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 275d79c6627a..ea9c15c0cdfe 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1573,7 +1573,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1589,9 +1589,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -1599,7 +1599,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -1626,23 +1626,23 @@ process_event(event_t *event, unsigned long offset, unsigned long head) nr_events++; switch (event->header.type) { - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return 0; - case PERF_EVENT_LOST: + case PERF_RECORD_LOST: nr_lost_chunks++; nr_lost_events += event->lost.lost; return 0; - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_EXIT ... PERF_EVENT_READ: + case PERF_RECORD_EXIT ... PERF_RECORD_READ: return 0; - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MAX: + case PERF_RECORD_MAX: default: return -1; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 61b828236c11..16af2d82e858 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -48,7 +48,7 @@ #include #include -static struct perf_counter_attr default_attrs[] = { +static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, @@ -130,11 +130,11 @@ struct stats runtime_cycles_stats; attrs[counter].config == PERF_COUNT_##c) #define ERR_PERF_OPEN \ -"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n" +"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n" static void create_perf_stat_counter(int counter, int pid) { - struct perf_counter_attr *attr = attrs + counter; + struct perf_event_attr *attr = attrs + counter; if (scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | @@ -144,7 +144,7 @@ static void create_perf_stat_counter(int counter, int pid) unsigned int cpu; for (cpu = 0; cpu < nr_cpus; cpu++) { - fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); + fd[cpu][counter] = sys_perf_event_open(attr, -1, cpu, -1, 0); if (fd[cpu][counter] < 0 && verbose) fprintf(stderr, ERR_PERF_OPEN, counter, fd[cpu][counter], strerror(errno)); @@ -154,7 +154,7 @@ static void create_perf_stat_counter(int counter, int pid) attr->disabled = 1; attr->enable_on_exec = 1; - fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0); + fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0); if (fd[0][counter] < 0 && verbose) fprintf(stderr, ERR_PERF_OPEN, counter, fd[0][counter], strerror(errno)); diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 600406396274..4405681b3134 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -937,21 +937,21 @@ process_event(event_t *event) switch (event->header.type) { - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event); - case PERF_EVENT_FORK: + case PERF_RECORD_FORK: return process_fork_event(event); - case PERF_EVENT_EXIT: + case PERF_RECORD_EXIT: return process_exit_event(event); - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return queue_sample_event(event); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_MMAP: - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_MMAP: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4002ccb36750..1ca88896eee4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -901,7 +901,7 @@ struct mmap_data { static unsigned int mmap_read_head(struct mmap_data *md) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; int head; head = pc->data_head; @@ -977,9 +977,9 @@ static void mmap_read_counter(struct mmap_data *md) old += size; - if (event->header.type == PERF_EVENT_SAMPLE) { + if (event->header.type == PERF_RECORD_SAMPLE) { int user = - (event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER; + (event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_USER; process_event(event->ip.ip, md->counter, user); } } @@ -1005,7 +1005,7 @@ int group_fd; static void start_counter(int i, int counter) { - struct perf_counter_attr *attr; + struct perf_event_attr *attr; int cpu; cpu = profile_cpu; @@ -1019,7 +1019,7 @@ static void start_counter(int i, int counter) attr->inherit = (cpu < 0) && inherit; try_again: - fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); + fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0); if (fd[i][counter] < 0) { int err = errno; @@ -1044,7 +1044,7 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[i][counter], strerror(err)); - die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } assert(fd[i][counter] >= 0); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 914ab366e369..e9d256e2f47d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -35,14 +35,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -82,7 +82,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -98,9 +98,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -108,7 +108,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -146,19 +146,19 @@ process_event(event_t *event, unsigned long offset, unsigned long head) trace_event(event); switch (event->header.type) { - case PERF_EVENT_MMAP ... PERF_EVENT_LOST: + case PERF_RECORD_MMAP ... PERF_RECORD_LOST: return 0; - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_EXIT ... PERF_EVENT_READ: + case PERF_RECORD_EXIT ... PERF_RECORD_READ: return 0; - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MAX: + case PERF_RECORD_MAX: default: return -1; } diff --git a/tools/perf/design.txt b/tools/perf/design.txt index f71e0d245cba..f1946d107b10 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -18,10 +18,10 @@ underlying hardware counters. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. -The special file descriptor is opened via the perf_counter_open() +The special file descriptor is opened via the perf_event_open() system call: - int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, + int sys_perf_event_open(struct perf_event_hw_event *hw_event_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); @@ -32,9 +32,9 @@ can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. -When creating a new counter fd, 'perf_counter_hw_event' is: +When creating a new counter fd, 'perf_event_hw_event' is: -struct perf_counter_hw_event { +struct perf_event_hw_event { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -93,7 +93,7 @@ specified by 'event_id': /* * Generalized performance counter event types, used by the hw_event.event_id - * parameter of the sys_perf_counter_open() syscall: + * parameter of the sys_perf_event_open() syscall: */ enum hw_event_ids { /* @@ -159,7 +159,7 @@ in size. * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. */ -enum perf_counter_read_format { +enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1, PERF_FORMAT_TOTAL_TIME_RUNNING = 2, }; @@ -178,7 +178,7 @@ interrupt: * Bits that can be set in hw_event.record_type to request information * in the overflow packets. */ -enum perf_counter_record_format { +enum perf_event_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, @@ -228,7 +228,7 @@ these events are recorded in the ring-buffer (see below). The 'comm' bit allows tracking of process comm data on process creation. This too is recorded in the ring-buffer (see below). -The 'pid' parameter to the perf_counter_open() system call allows the +The 'pid' parameter to the perf_event_open() system call allows the counter to be specific to a task: pid == 0: if the pid parameter is zero, the counter is attached to the @@ -258,7 +258,7 @@ The 'flags' parameter is currently unused and must be zero. The 'group_fd' parameter allows counter "groups" to be set up. A counter group has one counter which is the group "leader". The leader -is created first, with group_fd = -1 in the perf_counter_open call +is created first, with group_fd = -1 in the perf_event_open call that creates it. The rest of the group members are created subsequently, with group_fd giving the fd of the group leader. (A single counter on its own is created with group_fd = -1 and is @@ -277,13 +277,13 @@ tracking are logged into a ring-buffer. This ring-buffer is created and accessed through mmap(). The mmap size should be 1+2^n pages, where the first page is a meta-data page -(struct perf_counter_mmap_page) that contains various bits of information such +(struct perf_event_mmap_page) that contains various bits of information such as where the ring-buffer head is. /* * Structure of the page that can be mapped via mmap */ -struct perf_counter_mmap_page { +struct perf_event_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ @@ -317,7 +317,7 @@ struct perf_counter_mmap_page { * Control data for the mmap() data buffer. * * User-space reading this value should issue an rmb(), on SMP capable - * platforms, after reading this value -- see perf_counter_wakeup(). + * platforms, after reading this value -- see perf_event_wakeup(). */ __u32 data_head; /* head in the data section */ }; @@ -327,9 +327,9 @@ NOTE: the hw-counter userspace bits are arch specific and are currently only The following 2^n pages are the ring-buffer which contains events of the form: -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (1 << 1) +#define PERF_RECORD_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -353,8 +353,8 @@ enum perf_event_type { * char filename[]; * }; */ - PERF_EVENT_MMAP = 1, - PERF_EVENT_MUNMAP = 2, + PERF_RECORD_MMAP = 1, + PERF_RECORD_MUNMAP = 2, /* * struct { @@ -364,10 +364,10 @@ enum perf_event_type { * char comm[]; * }; */ - PERF_EVENT_COMM = 3, + PERF_RECORD_COMM = 3, /* - * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * When header.misc & PERF_RECORD_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* * * struct { @@ -397,7 +397,7 @@ Notification of new events is possible through poll()/select()/epoll() and fcntl() managing signals. Normally a notification is generated for every page filled, however one can -additionally set perf_counter_hw_event.wakeup_events to generate one every +additionally set perf_event_hw_event.wakeup_events to generate one every so many counter overflow events. Future work will include a splice() interface to the ring-buffer. @@ -409,11 +409,11 @@ events but does continue to exist and maintain its count value. An individual counter or counter group can be enabled with - ioctl(fd, PERF_COUNTER_IOC_ENABLE); + ioctl(fd, PERF_EVENT_IOC_ENABLE); or disabled with - ioctl(fd, PERF_COUNTER_IOC_DISABLE); + ioctl(fd, PERF_EVENT_IOC_DISABLE); Enabling or disabling the leader of a group enables or disables the whole group; that is, while the group leader is disabled, none of the @@ -424,16 +424,16 @@ other counter. Additionally, non-inherited overflow counters can use - ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr); + ioctl(fd, PERF_EVENT_IOC_REFRESH, nr); to enable a counter for 'nr' events, after which it gets disabled again. A process can enable or disable all the counter groups that are attached to it, using prctl: - prctl(PR_TASK_PERF_COUNTERS_ENABLE); + prctl(PR_TASK_PERF_EVENTS_ENABLE); - prctl(PR_TASK_PERF_COUNTERS_DISABLE); + prctl(PR_TASK_PERF_EVENTS_DISABLE); This applies to all counters on the current process, whether created by this process or by another, and doesn't affect any counters that @@ -447,11 +447,11 @@ Arch requirements If your architecture does not have hardware performance metrics, you can still use the generic software counters based on hrtimers for sampling. -So to start with, in order to add HAVE_PERF_COUNTERS to your Kconfig, you +So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you will need at least this: - - asm/perf_counter.h - a basic stub will suffice at first + - asm/perf_event.h - a basic stub will suffice at first - support for atomic64 types (and associated helper functions) - - set_perf_counter_pending() implemented + - set_perf_event_pending() implemented If your architecture does have hardware capabilities, you can override the -weak stub hw_perf_counter_init() to register hardware counters. +weak stub hw_perf_event_init() to register hardware counters. diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2abeb20d0bf3..8cc4623afd6f 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,15 +52,15 @@ #include #include -#include "../../include/linux/perf_counter.h" +#include "../../include/linux/perf_event.h" #include "util/types.h" /* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all * counters in the current task. */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 #ifndef NSEC_PER_SEC # define NSEC_PER_SEC 1000000000ULL @@ -90,12 +90,12 @@ static inline unsigned long long rdclock(void) _min1 < _min2 ? _min1 : _min2; }) static inline int -sys_perf_counter_open(struct perf_counter_attr *attr, +sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { attr->size = sizeof(*attr); - return syscall(__NR_perf_counter_open, attr, pid, cpu, + return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 018d414a09d1..2c9c26d6ded0 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -1,5 +1,5 @@ -#ifndef __PERF_EVENT_H -#define __PERF_EVENT_H +#ifndef __PERF_RECORD_H +#define __PERF_RECORD_H #include "../perf.h" #include "util.h" #include diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index bb4fca3efcc3..e306857b2c2b 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -9,7 +9,7 @@ /* * Create new perf.data header attribute: */ -struct perf_header_attr *perf_header_attr__new(struct perf_counter_attr *attr) +struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr) { struct perf_header_attr *self = malloc(sizeof(*self)); @@ -134,7 +134,7 @@ struct perf_file_section { }; struct perf_file_attr { - struct perf_counter_attr attr; + struct perf_event_attr attr; struct perf_file_section ids; }; @@ -320,7 +320,7 @@ u64 perf_header__sample_type(struct perf_header *header) return type; } -struct perf_counter_attr * +struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header) { int i; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7b0e84a87179..a0761bc7863c 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -1,12 +1,12 @@ #ifndef _PERF_HEADER_H #define _PERF_HEADER_H -#include "../../../include/linux/perf_counter.h" +#include "../../../include/linux/perf_event.h" #include #include "types.h" struct perf_header_attr { - struct perf_counter_attr attr; + struct perf_event_attr attr; int ids, size; u64 *id; off_t id_offset; @@ -34,11 +34,11 @@ char *perf_header__find_event(u64 id); struct perf_header_attr * -perf_header_attr__new(struct perf_counter_attr *attr); +perf_header_attr__new(struct perf_event_attr *attr); void perf_header_attr__add_id(struct perf_header_attr *self, u64 id); u64 perf_header__sample_type(struct perf_header *header); -struct perf_counter_attr * +struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 89172fd0038b..13ab4b842d49 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -10,7 +10,7 @@ int nr_counters; -struct perf_counter_attr attrs[MAX_COUNTERS]; +struct perf_event_attr attrs[MAX_COUNTERS]; struct event_symbol { u8 type; @@ -48,13 +48,13 @@ static struct event_symbol event_symbols[] = { { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, }; -#define __PERF_COUNTER_FIELD(config, name) \ - ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) +#define __PERF_EVENT_FIELD(config, name) \ + ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT) -#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) -#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) -#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) -#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) +#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) +#define PERF_EVENT_CONFIG(config) __PERF_EVENT_FIELD(config, CONFIG) +#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) +#define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) static const char *hw_event_names[] = { "cycles", @@ -352,7 +352,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int } static enum event_result -parse_generic_hw_event(const char **str, struct perf_counter_attr *attr) +parse_generic_hw_event(const char **str, struct perf_event_attr *attr) { const char *s = *str; int cache_type = -1, cache_op = -1, cache_result = -1; @@ -417,7 +417,7 @@ parse_single_tracepoint_event(char *sys_name, const char *evt_name, unsigned int evt_length, char *flags, - struct perf_counter_attr *attr, + struct perf_event_attr *attr, const char **strp) { char evt_path[MAXPATHLEN]; @@ -505,7 +505,7 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) static enum event_result parse_tracepoint_event(const char **strp, - struct perf_counter_attr *attr) + struct perf_event_attr *attr) { const char *evt_name; char *flags; @@ -563,7 +563,7 @@ static int check_events(const char *str, unsigned int i) } static enum event_result -parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) +parse_symbolic_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; unsigned int i; @@ -582,7 +582,7 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_raw_event(const char **strp, struct perf_counter_attr *attr) +parse_raw_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; u64 config; @@ -601,7 +601,7 @@ parse_raw_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_numeric_event(const char **strp, struct perf_counter_attr *attr) +parse_numeric_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; char *endp; @@ -623,7 +623,7 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_event_modifier(const char **strp, struct perf_counter_attr *attr) +parse_event_modifier(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; int eu = 1, ek = 1, eh = 1; @@ -656,7 +656,7 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr) * Symbolic names are (almost) exactly matched. */ static enum event_result -parse_event_symbols(const char **str, struct perf_counter_attr *attr) +parse_event_symbols(const char **str, struct perf_event_attr *attr) { enum event_result ret; @@ -711,7 +711,7 @@ static void store_event_type(const char *orgname) int parse_events(const struct option *opt __used, const char *str, int unset __used) { - struct perf_counter_attr attr; + struct perf_event_attr attr; enum event_result ret; if (strchr(str, ':')) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 60704c15961f..30c608112845 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -16,7 +16,7 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config); extern int nr_counters; -extern struct perf_counter_attr attrs[MAX_COUNTERS]; +extern struct perf_event_attr attrs[MAX_COUNTERS]; extern const char *event_name(int ctr); extern const char *__event_name(int type, u64 config); diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index 1fd824c1f1c4..af4b0573b37f 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -480,12 +480,12 @@ out: } static struct tracepoint_path * -get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters) +get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) { struct tracepoint_path path, *ppath = &path; int i; - for (i = 0; i < nb_counters; i++) { + for (i = 0; i < nb_events; i++) { if (pattrs[i].type != PERF_TYPE_TRACEPOINT) continue; ppath->next = tracepoint_id_to_path(pattrs[i].config); @@ -496,7 +496,7 @@ get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters) return path.next; } -void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters) +void read_tracing_data(struct perf_event_attr *pattrs, int nb_events) { char buf[BUFSIZ]; struct tracepoint_path *tps; @@ -530,7 +530,7 @@ void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters) page_size = getpagesize(); write_or_die(&page_size, 4); - tps = get_tracepoints_path(pattrs, nb_counters); + tps = get_tracepoints_path(pattrs, nb_events); read_header_files(); read_ftrace_files(tps); diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index d35ebf1e29ff..693f815c9429 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -240,6 +240,6 @@ unsigned long long raw_field_value(struct event *event, const char *name, void *data); void *raw_field_ptr(struct event *event, const char *name, void *data); -void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters); +void read_tracing_data(struct perf_event_attr *pattrs, int nb_events); #endif /* _TRACE_EVENTS_H */ -- cgit v1.2.3 From 57c0c15b5244320065374ad2c54f4fbec77a6428 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:20:38 +0200 Subject: perf: Tidy up after the big rename - provide compatibility Kconfig entry for existing PERF_COUNTERS .config's - provide courtesy copy of old perf_counter.h, for user-space projects - small indentation fixups - fix up MAINTAINERS - fix small x86 printout fallout - fix up small PowerPC comment fallout (use 'counter' as in register) Reviewed-by: Arjan van de Ven Acked-by: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- MAINTAINERS | 2 +- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/kernel/perf_event.c | 12 +- arch/x86/kernel/cpu/perf_event.c | 14 +- include/linux/perf_counter.h | 441 +++++++++++++++++++++++++++++++++++++++ include/linux/perf_event.h | 98 ++++----- init/Kconfig | 37 +++- kernel/perf_event.c | 4 +- 8 files changed, 534 insertions(+), 76 deletions(-) create mode 100644 include/linux/perf_counter.h (limited to 'arch/x86/kernel') diff --git a/MAINTAINERS b/MAINTAINERS index 43761a00e3f1..751a307dc44e 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4000,7 +4000,7 @@ S: Maintained F: include/linux/delayacct.h F: kernel/delayacct.c -PERFORMANCE COUNTER SUBSYSTEM +PERFORMANCE EVENTS SUBSYSTEM M: Peter Zijlstra M: Paul Mackerras M: Ingo Molnar diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 154f405b642f..7d8514ceceae 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -122,7 +122,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ - u8 perf_event_pending; /* PM interrupt while soft-disabled */ + u8 perf_event_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index c98321fcb459..197b7d958796 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -41,7 +41,7 @@ DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); struct power_pmu *ppmu; /* - * Normally, to ignore kernel events we set the FCS (freeze events + * Normally, to ignore kernel events we set the FCS (freeze counters * in supervisor mode) bit in MMCR0, but if the kernel runs with the * hypervisor bit set in the MSR, or if we are running on a processor * where the hypervisor bit is forced to 1 (as on Apple G5 processors), @@ -159,7 +159,7 @@ void perf_event_print_debug(void) } /* - * Read one performance monitor event (PMC). + * Read one performance monitor counter (PMC). */ static unsigned long read_pmc(int idx) { @@ -409,7 +409,7 @@ static void power_pmu_read(struct perf_event *event) val = read_pmc(event->hw.idx); } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); - /* The events are only 32 bits wide */ + /* The counters are only 32 bits wide */ delta = (val - prev) & 0xfffffffful; atomic64_add(delta, &event->count); atomic64_sub(delta, &event->hw.period_left); @@ -543,7 +543,7 @@ void hw_perf_disable(void) } /* - * Set the 'freeze events' bit. + * Set the 'freeze counters' bit. * The barrier is to make sure the mtspr has been * executed and the PMU has frozen the events * before we return. @@ -1124,7 +1124,7 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) } /* - * A event has overflowed; update its count and record + * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled * here so there is no possibility of being interrupted. */ @@ -1271,7 +1271,7 @@ static void perf_event_interrupt(struct pt_regs *regs) /* * Reset MMCR0 to its normal value. This will set PMXE and - * clear FC (freeze events) and PMAO (perf mon alert occurred) + * clear FC (freeze counters) and PMAO (perf mon alert occurred) * and thus allow interrupts to occur again. * XXX might want to use MSR.PM to keep the events frozen until * we get back out of this interrupt. diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0d03629fb1a5..a3c7adb06b78 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2081,13 +2081,13 @@ void __init init_hw_perf_events(void) perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.event_bits); - pr_info("... generic events: %d\n", x86_pmu.num_events); - pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); - pr_info("... event mask: %016Lx\n", perf_event_mask); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); } static inline void x86_pmu_read(struct perf_event *event) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 000000000000..368bd70f1d2d --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,441 @@ +/* + * NOTE: this file will be removed in a future kernel release, it is + * provided as a courtesy copy of user-space code that relies on the + * old (pre-rename) symbols and constants. + * + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache counters: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_counter_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ +}; + +/* + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + + __reserved_1 : 49; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + __u32 __reserved_2; + + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ + __u64 time_enabled; /* time counter active */ + __u64 time_running; /* time counter on cpu */ + + /* + * Hole for extension of the self monitor capabilities + */ + + __u64 __reserved[123]; /* align to 1k */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_counter_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ +}; + +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_EVENT_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_EVENT_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * }; + */ + PERF_EVENT_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * { u64 time; } && PERF_SAMPLE_TIME + * }; + */ + PERF_EVENT_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * }; + */ + PERF_EVENT_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * }; + */ + PERF_EVENT_SAMPLE = 9, + + PERF_EVENT_MAX, /* non-ABI */ +}; + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + +/* + * In case some app still references the old symbols: + */ + +#define __NR_perf_counter_open __NR_perf_event_open + +#define PR_TASK_PERF_COUNTERS_DISABLE PR_TASK_PERF_EVENTS_DISABLE +#define PR_TASK_PERF_COUNTERS_ENABLE PR_TASK_PERF_EVENTS_ENABLE + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ae9d9ed6df2a..acefaf71e6dd 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1,15 +1,15 @@ /* - * Performance events: + * Performance events: * * Copyright (C) 2008-2009, Thomas Gleixner * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra * - * Data type definitions, declarations, prototypes. + * Data type definitions, declarations, prototypes. * * Started by: Thomas Gleixner and Ingo Molnar * - * For licencing details see kernel-base/COPYING + * For licencing details see kernel-base/COPYING */ #ifndef _LINUX_PERF_EVENT_H #define _LINUX_PERF_EVENT_H @@ -131,19 +131,19 @@ enum perf_event_sample_format { * as specified by attr.read_format: * * struct read_format { - * { u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 id; } && PERF_FORMAT_ID - * } && !PERF_FORMAT_GROUP + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP * - * { u64 nr; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 value; - * { u64 id; } && PERF_FORMAT_ID - * } cntr[nr]; - * } && PERF_FORMAT_GROUP + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP * }; */ enum perf_event_read_format { @@ -152,7 +152,7 @@ enum perf_event_read_format { PERF_FORMAT_ID = 1U << 2, PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -216,8 +216,8 @@ struct perf_event_attr { * Ioctls that can be done on a perf event fd: */ #define PERF_EVENT_IOC_ENABLE _IO ('$', 0) -#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) -#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) #define PERF_EVENT_IOC_RESET _IO ('$', 3) #define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) #define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) @@ -314,9 +314,9 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; - * u64 id; - * u64 lost; + * struct perf_event_header header; + * u64 id; + * u64 lost; * }; */ PERF_RECORD_LOST = 2, @@ -383,23 +383,23 @@ enum perf_event_type { * { u64 id; } && PERF_SAMPLE_ID * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU - * { u64 period; } && PERF_SAMPLE_PERIOD + * { u64 period; } && PERF_SAMPLE_PERIOD * * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN * - * # - * # The RAW record below is opaque data wrt the ABI - * # - * # That is, the ABI doesn't make any promises wrt to - * # the stability of its content, it may vary depending - * # on event_id, hardware, kernel version and phase of - * # the moon. - * # - * # In other words, PERF_SAMPLE_RAW contents are not an ABI. - * # + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW @@ -503,10 +503,10 @@ struct pmu { * enum perf_event_active_state - the states of a event */ enum perf_event_active_state { - PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_ERROR = -2, PERF_EVENT_STATE_OFF = -1, PERF_EVENT_STATE_INACTIVE = 0, - PERF_EVENT_STATE_ACTIVE = 1, + PERF_EVENT_STATE_ACTIVE = 1, }; struct file; @@ -529,7 +529,7 @@ struct perf_mmap_data { long watermark; /* wakeup watermark */ - struct perf_event_mmap_page *user_page; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; @@ -694,14 +694,14 @@ struct perf_cpu_context { }; struct perf_output_handle { - struct perf_event *event; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; + struct perf_event *event; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; }; #ifdef CONFIG_PERF_EVENTS @@ -829,22 +829,22 @@ static inline void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void -perf_event_task_tick(struct task_struct *task, int cpu) { } +perf_event_task_tick(struct task_struct *task, int cpu) { } static inline int perf_event_init_task(struct task_struct *child) { return 0; } static inline void perf_event_exit_task(struct task_struct *child) { } static inline void perf_event_free_task(struct task_struct *task) { } -static inline void perf_event_do_pending(void) { } -static inline void perf_event_print_debug(void) { } +static inline void perf_event_do_pending(void) { } +static inline void perf_event_print_debug(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } -static inline int perf_event_task_disable(void) { return -EINVAL; } -static inline int perf_event_task_enable(void) { return -EINVAL; } +static inline int perf_event_task_disable(void) { return -EINVAL; } +static inline int perf_event_task_enable(void) { return -EINVAL; } static inline void perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { } -static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_comm(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } diff --git a/init/Kconfig b/init/Kconfig index cfdf5c322806..706728be312f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -920,26 +920,31 @@ config HAVE_PERF_EVENTS help See tools/perf/design.txt for details. -menu "Performance Counters" +menu "Kernel Performance Events And Counters" config PERF_EVENTS - bool "Kernel Performance Counters" - default y if PROFILING + bool "Kernel performance events and counters" + default y if (PROFILING || PERF_COUNTERS) depends on HAVE_PERF_EVENTS select ANON_INODES help - Enable kernel support for performance counter hardware. + Enable kernel support for various performance events provided + by software and hardware. - Performance counters are special hardware registers available - on most modern CPUs. These registers count the number of certain + Software events are supported either build-in or via the + use of generic tracepoints. + + Most modern CPUs support performance events via performance + counter registers. These registers count the number of certain types of hw events: such as instructions executed, cachemisses suffered, or branches mis-predicted - without slowing down the kernel or applications. These registers can also trigger interrupts when a threshold number of events have passed - and can thus be used to profile the code that runs on that CPU. - The Linux Performance Counter subsystem provides an abstraction of - these hardware capabilities, available via a system call. It + The Linux Performance Event subsystem provides an abstraction of + these software and hardware cevent apabilities, available via a + system call and used by the "perf" utility in tools/perf/. It provides per task and per CPU counters, and it provides event capabilities on top of those. @@ -950,14 +955,26 @@ config EVENT_PROFILE depends on PERF_EVENTS && EVENT_TRACING default y help - Allow the use of tracepoints as software performance counters. + Allow the use of tracepoints as software performance events. - When this is enabled, you can create perf counters based on + When this is enabled, you can create perf events based on tracepoints using PERF_TYPE_TRACEPOINT and the tracepoint ID found in debugfs://tracing/events/*/*/id. (The -e/--events option to the perf tool can parse and interpret symbolic tracepoints, in the subsystem:tracepoint_name format.) +config PERF_COUNTERS + bool "Kernel performance counters (old config option)" + depends on HAVE_PERF_EVENTS + help + This config has been obsoleted by the PERF_EVENTS + config option - please see that one for details. + + It has no effect on the kernel whether you enable + it or not, it is a compatibility placeholder. + + Say N if unsure. + endmenu config VM_EVENT_COUNTERS diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6e8b99a04e1e..76ac4db405e9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1,12 +1,12 @@ /* - * Performance event core code + * Performance events core code: * * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. * - * For licensing details see kernel-base/COPYING + * For licensing details see kernel-base/COPYING */ #include -- cgit v1.2.3 From 7da8b6ddc7a03346f825925e0d981ca2bd1ed617 Mon Sep 17 00:00:00 2001 From: Michael Tokarev Date: Wed, 22 Jul 2009 17:50:23 +0400 Subject: trivial: fix missing printk space in amd_k7_smp_check This trivial patch fixes one missing space in printk. I already fixed it about half a year ago or more, but the change (in arch/x86/kernel/cpu/smpboot.c at that time) didn't made into mainline yet. Signed-off-by: Michael Tokarev index 28e5f59..6c139ed 100644 Signed-off-by: Jiri Kosina --- arch/x86/kernel/cpu/amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f32fa71ccf97..c910a716a71c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -184,7 +184,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) * approved Athlon */ WARN_ONCE(1, "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); + " processors is not suitable for SMP.\n"); if (!test_taint(TAINT_UNSAFE_SMP)) add_taint(TAINT_UNSAFE_SMP); -- cgit v1.2.3 From 4481374ce88ba8f460c8b89f2572027bd27057d0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 21 Sep 2009 17:03:05 -0700 Subject: mm: replace various uses of num_physpages by totalram_pages Sizing of memory allocations shouldn't depend on the number of physical pages found in a system, as that generally includes (perhaps a huge amount of) non-RAM pages. The amount of what actually is usable as storage should instead be used as a basis here. Some of the calculations (i.e. those not intending to use high memory) should likely even use (totalram_pages - totalhigh_pages). Signed-off-by: Jan Beulich Acked-by: Rusty Russell Acked-by: Ingo Molnar Cc: Dave Airlie Cc: Kyle McMartin Cc: Jeremy Fitzhardinge Cc: Pekka Enberg Cc: Hugh Dickins Cc: "David S. Miller" Cc: Patrick McHardy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/microcode_core.c | 4 ++-- drivers/char/agp/backend.c | 4 ++-- drivers/parisc/ccio-dma.c | 4 ++-- drivers/parisc/sba_iommu.c | 4 ++-- drivers/xen/balloon.c | 4 ---- fs/ntfs/malloc.h | 2 +- include/linux/mm.h | 1 + init/main.c | 4 ++-- mm/slab.c | 2 +- mm/swap.c | 2 +- mm/vmalloc.c | 4 ++-- net/core/sock.c | 4 ++-- net/dccp/proto.c | 6 +++--- net/decnet/dn_route.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp.c | 4 ++-- net/netfilter/nf_conntrack_core.c | 4 ++-- net/netfilter/x_tables.c | 2 +- net/netfilter/xt_hashlimit.c | 8 ++++---- net/netlink/af_netlink.c | 6 +++--- net/sctp/protocol.c | 6 +++--- 21 files changed, 38 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 0db7969b0dde..378e9a8f1bf8 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -210,8 +210,8 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, { ssize_t ret = -EINVAL; - if ((len >> PAGE_SHIFT) > num_physpages) { - pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + if ((len >> PAGE_SHIFT) > totalram_pages) { + pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); return ret; } diff --git a/drivers/char/agp/backend.c b/drivers/char/agp/backend.c index ad87753f6de4..a56ca080e108 100644 --- a/drivers/char/agp/backend.c +++ b/drivers/char/agp/backend.c @@ -114,9 +114,9 @@ static int agp_find_max(void) long memory, index, result; #if PAGE_SHIFT < 20 - memory = num_physpages >> (20 - PAGE_SHIFT); + memory = totalram_pages >> (20 - PAGE_SHIFT); #else - memory = num_physpages << (PAGE_SHIFT - 20); + memory = totalram_pages << (PAGE_SHIFT - 20); #endif index = 1; diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index a45b0c0d574e..a6b4a5a53d40 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1266,7 +1266,7 @@ ccio_ioc_init(struct ioc *ioc) ** Hot-Plug/Removal of PCI cards. (aka PCI OLARD). */ - iova_space_size = (u32) (num_physpages / count_parisc_driver(&ccio_driver)); + iova_space_size = (u32) (totalram_pages / count_parisc_driver(&ccio_driver)); /* limit IOVA space size to 1MB-1GB */ @@ -1305,7 +1305,7 @@ ccio_ioc_init(struct ioc *ioc) DBG_INIT("%s() hpa 0x%p mem %luMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_regs, - (unsigned long) num_physpages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index 123d8fe3427d..57a6d19eba4c 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -1390,7 +1390,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) ** for DMA hints - ergo only 30 bits max. */ - iova_space_size = (u32) (num_physpages/global_ioc_cnt); + iova_space_size = (u32) (totalram_pages/global_ioc_cnt); /* limit IOVA space size to 1MB-1GB */ if (iova_space_size < (1 << (20 - PAGE_SHIFT))) { @@ -1415,7 +1415,7 @@ sba_ioc_init(struct parisc_device *sba, struct ioc *ioc, int ioc_num) DBG_INIT("%s() hpa 0x%lx mem %ldMB IOV %dMB (%d bits)\n", __func__, ioc->ioc_hpa, - (unsigned long) num_physpages >> (20 - PAGE_SHIFT), + (unsigned long) totalram_pages >> (20 - PAGE_SHIFT), iova_space_size>>20, iov_order + PAGE_SHIFT); diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index f5bbd9e83416..1b7123eb5d7b 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -96,11 +96,7 @@ static struct balloon_stats balloon_stats; /* We increase/decrease in batches which fit in a page */ static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; -/* VM /proc information for memory */ -extern unsigned long totalram_pages; - #ifdef CONFIG_HIGHMEM -extern unsigned long totalhigh_pages; #define inc_totalhigh_pages() (totalhigh_pages++) #define dec_totalhigh_pages() (totalhigh_pages--) #else diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index cd0be3f5c3cd..a44b14cbceeb 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -47,7 +47,7 @@ static inline void *__ntfs_malloc(unsigned long size, gfp_t gfp_mask) return kmalloc(PAGE_SIZE, gfp_mask & ~__GFP_HIGHMEM); /* return (void *)__get_free_page(gfp_mask); */ } - if (likely(size >> PAGE_SHIFT < num_physpages)) + if (likely((size >> PAGE_SHIFT) < totalram_pages)) return __vmalloc(size, gfp_mask, PAGE_KERNEL); return NULL; } diff --git a/include/linux/mm.h b/include/linux/mm.h index d808cf832c4d..19ff81c49ba6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -25,6 +25,7 @@ extern unsigned long max_mapnr; #endif extern unsigned long num_physpages; +extern unsigned long totalram_pages; extern void * high_memory; extern int page_cluster; diff --git a/init/main.c b/init/main.c index 34971becbd3c..2c48c3153163 100644 --- a/init/main.c +++ b/init/main.c @@ -668,12 +668,12 @@ asmlinkage void __init start_kernel(void) #endif thread_info_cache_init(); cred_init(); - fork_init(num_physpages); + fork_init(totalram_pages); proc_caches_init(); buffer_init(); key_init(); security_init(); - vfs_caches_init(num_physpages); + vfs_caches_init(totalram_pages); radix_tree_init(); signals_init(); /* rootfs populating might need page-writeback */ diff --git a/mm/slab.c b/mm/slab.c index 7b5d4deacfcd..7dfa481c96ba 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1384,7 +1384,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (totalram_pages > (32 << 20) >> PAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated diff --git a/mm/swap.c b/mm/swap.c index 4a8a59e671f7..308e57d8d7ed 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -496,7 +496,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP bdi_init(swapper_space.backing_dev_info); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9216b2555d07..5535da1d6961 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1386,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > num_physpages) + if (count > totalram_pages) return NULL; area = get_vm_area_caller((count << PAGE_SHIFT), flags, @@ -1493,7 +1493,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, diff --git a/net/core/sock.c b/net/core/sock.c index 30d5446512f9..524712a7b154 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1206,12 +1206,12 @@ EXPORT_SYMBOL_GPL(sk_setup_caps); void __init sk_init(void) { - if (num_physpages <= 4096) { + if (totalram_pages <= 4096) { sysctl_wmem_max = 32767; sysctl_rmem_max = 32767; sysctl_wmem_default = 32767; sysctl_rmem_default = 32767; - } else if (num_physpages >= 131072) { + } else if (totalram_pages >= 131072) { sysctl_wmem_max = 131071; sysctl_rmem_max = 131071; } diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 923db06c7e55..bc4467082a00 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1049,10 +1049,10 @@ static int __init dccp_init(void) * * The methodology is similar to that of the buffer cache. */ - if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (21 - PAGE_SHIFT); + if (totalram_pages >= (128 * 1024)) + goal = totalram_pages >> (21 - PAGE_SHIFT); else - goal = num_physpages >> (23 - PAGE_SHIFT); + goal = totalram_pages >> (23 - PAGE_SHIFT); if (thash_entries) goal = (thash_entries * diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 9383d3e5a1ab..57662cabaf9b 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1750,7 +1750,7 @@ void __init dn_route_init(void) dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); - goal = num_physpages >> (26 - PAGE_SHIFT); + goal = totalram_pages >> (26 - PAGE_SHIFT); for(order = 0; (1UL << order) < goal; order++) /* NOTHING */; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 91867d3e6328..df9347314538 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3414,7 +3414,7 @@ int __init ip_rt_init(void) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), rhash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 15 : 17, 0, &rt_hash_log, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 19a0612b8a20..21387ebabf00 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2862,7 +2862,7 @@ void __init tcp_init(void) alloc_large_system_hash("TCP established", sizeof(struct inet_ehash_bucket), thash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 13 : 15, 0, &tcp_hashinfo.ehash_size, @@ -2879,7 +2879,7 @@ void __init tcp_init(void) alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), tcp_hashinfo.ehash_size, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 13 : 15, 0, &tcp_hashinfo.bhash_size, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index b37109817a98..7c9ec3dee96e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1245,9 +1245,9 @@ static int nf_conntrack_init_init_net(void) * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ if (!nf_conntrack_htable_size) { nf_conntrack_htable_size - = (((num_physpages << PAGE_SHIFT) / 16384) + = (((totalram_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) nf_conntrack_htable_size = 16384; if (nf_conntrack_htable_size < 32) nf_conntrack_htable_size = 32; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a6ac83a93348..f01955cce314 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -617,7 +617,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) int cpu; /* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */ - if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > num_physpages) + if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages) return NULL; newinfo = kzalloc(XT_TABLE_INFO_SZ, GFP_KERNEL); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 219dcdbe388c..dd16e404424f 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -194,9 +194,9 @@ static int htable_create_v0(struct xt_hashlimit_info *minfo, u_int8_t family) if (minfo->cfg.size) size = minfo->cfg.size; else { - size = ((num_physpages << PAGE_SHIFT) / 16384) / + size = ((totalram_pages << PAGE_SHIFT) / 16384) / sizeof(struct list_head); - if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE)) + if (totalram_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) size = 8192; if (size < 16) size = 16; @@ -266,9 +266,9 @@ static int htable_create(struct xt_hashlimit_mtinfo1 *minfo, u_int8_t family) if (minfo->cfg.size) { size = minfo->cfg.size; } else { - size = (num_physpages << PAGE_SHIFT) / 16384 / + size = (totalram_pages << PAGE_SHIFT) / 16384 / sizeof(struct list_head); - if (num_physpages > 1024 * 1024 * 1024 / PAGE_SIZE) + if (totalram_pages > 1024 * 1024 * 1024 / PAGE_SIZE) size = 8192; if (size < 16) size = 16; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c5aab6a368ce..55180b99562a 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2091,10 +2091,10 @@ static int __init netlink_proto_init(void) if (!nl_table) goto panic; - if (num_physpages >= (128 * 1024)) - limit = num_physpages >> (21 - PAGE_SHIFT); + if (totalram_pages >= (128 * 1024)) + limit = totalram_pages >> (21 - PAGE_SHIFT); else - limit = num_physpages >> (23 - PAGE_SHIFT); + limit = totalram_pages >> (23 - PAGE_SHIFT); order = get_bitmask_order(limit) - 1 + PAGE_SHIFT; limit = (1UL << order) / sizeof(struct hlist_head); diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index c557f1fb1c66..612dc878e05c 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -1184,10 +1184,10 @@ SCTP_STATIC __init int sctp_init(void) /* Size and allocate the association hash table. * The methodology is similar to that of the tcp hash tables. */ - if (num_physpages >= (128 * 1024)) - goal = num_physpages >> (22 - PAGE_SHIFT); + if (totalram_pages >= (128 * 1024)) + goal = totalram_pages >> (22 - PAGE_SHIFT); else - goal = num_physpages >> (24 - PAGE_SHIFT); + goal = totalram_pages >> (24 - PAGE_SHIFT); for (order = 0; (1UL << order) < goal; order++) ; -- cgit v1.2.3 From 3c1596efe167322dae87f8390d36f91ce2d7f936 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Mon, 21 Sep 2009 17:03:06 -0700 Subject: mm: don't use alloc_bootmem_low() where not strictly needed Since alloc_bootmem() will never return inaccessible (via virtual addressing) memory anyway, using the ..._low() variant only makes sense when the physical address range of the allocated memory must fulfill further constraints, espacially since on 64-bits (or more generally in all cases where the pools the two variants allocate from are than the full available range. Probably the use in alloc_tce_table() could also be eliminated (based on code inspection of pci-calgary_64.c), but that seems too risky given I know nothing about that hardware and have no way to test it. Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 2 +- arch/x86/mm/init_32.c | 4 ++-- drivers/firmware/memmap.c | 2 +- kernel/power/snapshot.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a3210ce1eccd..85419bb7d4ab 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1331,7 +1331,7 @@ void __init e820_reserve_resources(void) struct resource *res; u64 end; - res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + res = alloc_bootmem(sizeof(struct resource) * e820.nr_map); e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 95e877f5b846..b49b4f67453d 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -84,7 +84,7 @@ static pmd_t * __init one_md_table_init(pgd_t *pgd) #ifdef CONFIG_X86_PAE if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { if (after_bootmem) - pmd_table = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); + pmd_table = (pmd_t *)alloc_bootmem_pages(PAGE_SIZE); else pmd_table = (pmd_t *)alloc_low_page(); paravirt_alloc_pmd(&init_mm, __pa(pmd_table) >> PAGE_SHIFT); @@ -116,7 +116,7 @@ static pte_t * __init one_page_table_init(pmd_t *pmd) #endif if (!page_table) page_table = - (pte_t *)alloc_bootmem_low_pages(PAGE_SIZE); + (pte_t *)alloc_bootmem_pages(PAGE_SIZE); } else page_table = (pte_t *)alloc_low_page(); diff --git a/drivers/firmware/memmap.c b/drivers/firmware/memmap.c index d5ea8a68d338..56f9234781fa 100644 --- a/drivers/firmware/memmap.c +++ b/drivers/firmware/memmap.c @@ -164,7 +164,7 @@ int __init firmware_map_add_early(u64 start, u64 end, const char *type) { struct firmware_map_entry *entry; - entry = alloc_bootmem_low(sizeof(struct firmware_map_entry)); + entry = alloc_bootmem(sizeof(struct firmware_map_entry)); if (WARN_ON(!entry)) return -ENOMEM; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 97955b0e44f4..36cb168e4330 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -619,7 +619,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, BUG_ON(!region); } else /* This allocation cannot fail */ - region = alloc_bootmem_low(sizeof(struct nosave_region)); + region = alloc_bootmem(sizeof(struct nosave_region)); region->start_pfn = start_pfn; region->end_pfn = end_pfn; list_add_tail(®ion->list, &nosave_regions); -- cgit v1.2.3 From 3967684006f30c253bc6d4a6604d1bad4a7fc672 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Sep 2009 15:50:24 +0200 Subject: x86: mce: Clean up thermal throttling state tracking code Instead of a mess of three separate percpu variables, consolidate the state into a single structure. Also clean up therm_throt_process(), use cleaner and more understandable variable names and a clearer logic. This, without changing the logic, makes the code more streamlined, more readable and smaller as well: text data bss dec hex filename 1487 169 4 1660 67c therm_throt.o.before 1432 176 4 1612 64c therm_throt.o.after Cc: Hidetoshi Seto Cc: Huang Ying Cc: Andi Kleen LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 63 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 63a56d147e4a..db80b577f601 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,20 +34,30 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) -static DEFINE_PER_CPU(__u64, next_check) = INITIAL_JIFFIES; -static DEFINE_PER_CPU(unsigned long, thermal_throttle_count); -static DEFINE_PER_CPU(bool, thermal_throttle_active); +/* + * Current thermal throttling state: + */ +struct thermal_state { + bool is_throttled; + + u64 next_check; + unsigned long throttle_count; +}; + +static DEFINE_PER_CPU(struct thermal_state, thermal_state); -static atomic_t therm_throt_en = ATOMIC_INIT(0); +static atomic_t therm_throt_en = ATOMIC_INIT(0); #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) #define define_therm_throt_sysdev_show_func(name) \ -static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ - struct sysdev_attribute *attr, \ - char *buf) \ + \ +static ssize_t therm_throt_sysdev_show_##name( \ + struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ { \ unsigned int cpu = dev->id; \ ssize_t ret; \ @@ -55,7 +65,7 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ preempt_disable(); /* CPU hotplug */ \ if (cpu_online(cpu)) \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_throttle_##name, cpu)); \ + per_cpu(thermal_state, cpu).name); \ else \ ret = 0; \ preempt_enable(); \ @@ -63,11 +73,11 @@ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ return ret; \ } -define_therm_throt_sysdev_show_func(count); -define_therm_throt_sysdev_one_ro(count); +define_therm_throt_sysdev_show_func(throttle_count); +define_therm_throt_sysdev_one_ro(throttle_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_count.attr, + &attr_throttle_count.attr, NULL }; @@ -93,33 +103,38 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(int curr) +static int therm_throt_process(bool is_throttled) { - unsigned int cpu = smp_processor_id(); - __u64 tmp_jiffs = get_jiffies_64(); - bool was_throttled = __get_cpu_var(thermal_throttle_active); - bool is_throttled = __get_cpu_var(thermal_throttle_active) = curr; + struct thermal_state *state; + unsigned int this_cpu; + bool was_throttled; + u64 now; + + this_cpu = smp_processor_id(); + now = get_jiffies_64(); + state = &per_cpu(thermal_state, this_cpu); + + was_throttled = state->is_throttled; + state->is_throttled = is_throttled; if (is_throttled) - __get_cpu_var(thermal_throttle_count)++; + state->throttle_count++; if (!(was_throttled ^ is_throttled) && - time_before64(tmp_jiffs, __get_cpu_var(next_check))) + time_before64(now, state->next_check)) return 0; - __get_cpu_var(next_check) = tmp_jiffs + CHECK_INTERVAL; + state->next_check = now + CHECK_INTERVAL; /* if we just entered the thermal event */ if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, " - "cpu clock throttled (total events = %lu)\n", - cpu, __get_cpu_var(thermal_throttle_count)); + printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); add_taint(TAINT_MACHINE_CHECK); return 1; } if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", cpu); + printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); return 1; } @@ -213,7 +228,7 @@ static void intel_thermal_interrupt(void) __u64 msr_val; rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT)) + if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) mce_log_therm_throt_event(msr_val); } -- cgit v1.2.3 From b417c9fd8690637f0c91479435ab3e2bf450c038 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Sep 2009 15:50:24 +0200 Subject: x86: mce: Fix thermal throttling message storm If a system switches back and forth between hot and cold mode, the MCE code will print a stream of critical kernel messages. Extend the throttling code to properly notice this, by only printing the first hot + cold transition and omitting the rest up to CHECK_INTERVAL (5 minutes). This way we'll only get a single incident of: [ 102.356584] CPU0: Temperature above threshold, cpu clock throttled (total events = 1) [ 102.357000] Disabling lock debugging due to kernel taint [ 102.369223] CPU0: Temperature/speed normal Every 5 minutes. The 'total events' count tells the number of cold/hot transitions detected, should overheating occur after 5 minutes again: [ 402.357580] CPU0: Temperature above threshold, cpu clock throttled (total events = 24891) [ 402.358001] CPU0: Temperature/speed normal [ 450.704142] Machine check events logged Cc: Hidetoshi Seto Cc: Huang Ying Cc: Andi Kleen LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index db80b577f601..b3a1dba75330 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -42,6 +42,7 @@ struct thermal_state { u64 next_check; unsigned long throttle_count; + unsigned long last_throttle_count; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -120,11 +121,12 @@ static int therm_throt_process(bool is_throttled) if (is_throttled) state->throttle_count++; - if (!(was_throttled ^ is_throttled) && - time_before64(now, state->next_check)) + if (time_before64(now, state->next_check) && + state->throttle_count != state->last_throttle_count) return 0; state->next_check = now + CHECK_INTERVAL; + state->last_throttle_count = state->throttle_count; /* if we just entered the thermal event */ if (is_throttled) { -- cgit v1.2.3 From 14c0abf14a5e67e793131116bd97f57da37ccce3 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 22 Sep 2009 14:35:57 +0800 Subject: x86: mce, inject: Use real inject-msg in raise_local Current raise_local() uses a struct mce that comes from mce_write() as a parameter instead of the real inject-msg, so when we set mce.finished = 0 to clear injected MCE, the real inject stays valid. This will cause the remaining inject-msg affect the next injection, which is not desired. To fix this, real inject-msg is used in raise_local instead of the one on the stack. This patch is based on the diagnosis and the fixes by Dean Nelson. Reported-by: Dean Nelson Signed-off-by: Huang Ying Cc: Hidetoshi Seto Cc: Andi Kleen LKML-Reference: <1253601357.15717.757.camel@yhuang-dev.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 7029f0e2acad..472763d92098 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -98,8 +98,9 @@ static struct notifier_block mce_raise_nb = { }; /* Inject mce on current CPU */ -static int raise_local(struct mce *m) +static int raise_local(void) { + struct mce *m = &__get_cpu_var(injectm); int context = MCJ_CTX(m->inject_flags); int ret = 0; int cpu = m->extcpu; @@ -167,12 +168,12 @@ static void raise_mce(struct mce *m) } cpu_relax(); } - raise_local(m); + raise_local(); put_cpu(); put_online_cpus(); } else #endif - raise_local(m); + raise_local(); } /* Error injection interface */ -- cgit v1.2.3 From b60e714dc3ef2a2541e3b262b8b742ba08a07fc0 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 22 Sep 2009 16:46:34 -0700 Subject: x86: ptrace: sysret path should reach syscall_trace_leave If TIF_SYSCALL_TRACE or TIF_SINGLESTEP is set while inside a syscall, the path back to user mode should get to syscall_trace_leave. This does happen in most circumstances. The exception to this is on the 64-bit syscall fastpath, when no such flag was set on syscall entry and nothing else has punted it off the fastpath for exit. That one exit fastpath fails to check for _TIF_WORK_SYSCALL_EXIT flags. This makes the behavior inconsistent with what 32-bit tasks see and what the native 32-bit kernel always does, and what 64-bit tasks see in all cases where the iret path is taken anyhow. Perhaps the only example that is affected is a ptrace stop inside do_fork (for PTRACE_O_TRACE{CLONE,FORK,VFORK,VFORKDONE}). Other syscalls with internal ptrace stop points (execve) already take the iret exit path for unrelated reasons. Test cases for both PTRACE_SYSCALL and PTRACE_SINGLESTEP variants are at: http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/syscall-from-clone.c?cvsroot=systemtap http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/step-from-clone.c?cvsroot=systemtap There was no special benefit to the sysret path's special path to call do_notify_resume, because it always takes the iret exit path at the end. So this change just makes the sysret exit path join the iret exit path for all the signals and ptrace cases. The fastpath still applies to the plain syscall-audit and resched cases. Signed-off-by: Roland McGrath CC: Oleg Nesterov --- arch/x86/kernel/entry_64.S | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 681c3fda7391..b5c061f8f358 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -536,20 +536,13 @@ sysret_signal: bt $TIF_SYSCALL_AUDIT,%edx jc sysret_audit #endif - /* edx: work flags (arg3) */ - leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 - xorl %esi,%esi # oldset -> arg2 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call do_notify_resume - RESTORE_TOP_OF_STACK %r11 - RESTORE_REST - movl $_TIF_WORK_MASK,%edi - /* Use IRET because user could have changed frame. This - works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check + /* + * We have a signal, or exit tracing or single-step. + * These all wind up with the iret return path anyway, + * so just join that path right now. + */ + FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + jmp int_check_syscall_exit_work badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) @@ -654,6 +647,7 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +int_check_syscall_exit_work: SAVE_REST /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx -- cgit v1.2.3 From 08ff18e299b1a1c91f4911fe9f35c4550218c73f Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 22 Sep 2009 19:58:58 -0700 Subject: x86: ptrace: do not sign-extend orig_ax on write The high 32 bits of orig_ax will be ignored when it matters, so don't fiddle them when setting it. Signed-off-by: Roland McGrath --- arch/x86/kernel/ptrace.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8d7d5c9c1be3..52222fab99f4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -325,16 +325,6 @@ static int putreg(struct task_struct *child, return set_flags(child, value); #ifdef CONFIG_X86_64 - /* - * Orig_ax is really just a flag with small positive and - * negative values, so make sure to always sign-extend it - * from 32 bits so that it works correctly regardless of - * whether we come from a 32-bit environment or not. - */ - case offsetof(struct user_regs_struct, orig_ax): - value = (long) (s32) value; - break; - case offsetof(struct user_regs_struct,fs_base): if (value >= TASK_SIZE_OF(child)) return -EIO; @@ -1121,17 +1111,10 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) R32(esi, si); R32(ebp, bp); R32(eax, ax); + R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); - case offsetof(struct user32, regs.orig_eax): - /* - * Sign-extend the value so that orig_eax = -1 - * causes (long)orig_ax < 0 tests to fire correctly. - */ - regs->orig_ax = (long) (s32) value; - break; - case offsetof(struct user32, regs.eflags): return set_flags(child, value); -- cgit v1.2.3 From 8cb3ed13935b9b523c2de7afc8f68473fe1d4531 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 22 Sep 2009 20:12:07 -0700 Subject: x86: ptrace: set TS_COMPAT when 32-bit ptrace sets orig_eax>=0 The 32-bit ptrace syscall on a 64-bit kernel (32-bit debugger on 32-bit task) behaves differently than a native 32-bit kernel. When setting a register state of orig_eax>=0 and eax=-ERESTART* when the debugged task is NOT on its way out of a 32-bit syscall, the task will fail to do the syscall restart logic that it should do. Test case available at http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/erestartsys-trap.c?cvsroot=systemtap This happens because the 32-bit ptrace syscall sets eax=0xffffffff when it sets orig_eax>=0. The resuming task will not sign-extend this for the -ERESTART* check because TS_COMPAT is not set. (So the task thinks it is restarting after a 64-bit syscall, not a 32-bit one.) The fix is to have 32-bit ptrace calls set TS_COMPAT when setting orig_eax>=0. This ensures that the 32-bit syscall restart logic will apply when the child resumes. Signed-off-by: Roland McGrath --- arch/x86/kernel/ptrace.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 52222fab99f4..7b058a2dc66a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1111,10 +1111,22 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) R32(esi, si); R32(ebp, bp); R32(eax, ax); - R32(orig_eax, orig_ax); R32(eip, ip); R32(esp, sp); + case offsetof(struct user32, regs.orig_eax): + /* + * A 32-bit debugger setting orig_eax means to restore + * the state of the task restarting a 32-bit syscall. + * Make sure we interpret the -ERESTART* codes correctly + * in case the task is not actually still sitting at the + * exit from a 32-bit syscall with TS_COMPAT still set. + */ + regs->orig_ax = value; + if (syscall_get_nr(child, regs) >= 0) + task_thread_info(child)->status |= TS_COMPAT; + break; + case offsetof(struct user32, regs.eflags): return set_flags(child, value); -- cgit v1.2.3 From 7d42896628202a551ad1107697cd215dc5fca099 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Sep 2009 11:03:37 +0200 Subject: perf_event, x86: Fix 'perf sched record' crashing the machine Chris Malley reported that 'perf sched record' sometimes crashes his box with: [ 389.272175] BUG: unable to handle kernel paging request at ffffb300 [ 389.272294] IP: [] default_send_IPI_self+0x1d/0x50 [ 389.272366] *pde = 0073f067 *pte = 00000000 [ 389.274708] Call Trace: [ 389.274752] [] ? set_perf_event_pending+0x14/0x20 [ 389.274801] [] ? perf_output_unlock+0x121/0x1a0 [ 389.274848] [] ? perf_output_end+0x4a/0x70 [ 389.274893] [] ? __perf_event_overflow+0x240/0x2f0 [ 389.274942] [] ? atomic64_cmpxchg+0x1e/0x30 [ 389.274988] [] ? perf_swevent_ctx_event+0x1b4/0x1c0 [ 389.275035] [] ? perf_swevent_ctx_event+0x33/0x1c0 [ 389.275081] [] ? do_perf_sw_event+0xa7/0x160 [ 389.275127] [] ? perf_tp_event+0x82/0xa0 [ 389.275174] [] ? ftrace_profile_sched_stat_runtime+0xe6/0x120 [ 389.275224] [] ? ftrace_profile_sched_stat_runtime+0x0/0x120 [ 389.275273] [] ? update_curr+0x18a/0x230 [ 389.275318] [] ? put_prev_task_fair+0x155/0x160 [ 389.275366] [] ? sched_clock_cpu+0xd5/0x110 [ 389.275413] [] ? _spin_lock_irq+0x45/0x50 [ 389.275458] [] ? schedule+0x20e/0xb10 The problem is that the box has no lapic enabled: [ 0.042445] Local APIC not detected. Using dummy APIC emulation. The below seems like the best fix. We disabled all lapic bits, except the self-IPI-resend logic. Reported-by: Chris Malley Signed-off-by: Peter Zijlstra Cc: Cyrill Gorcunov Cc: Frederic Weisbecker LKML-Reference: <7863dc4c0909221409v7893bfd3o4b590d5951a233ba@mail.gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index a3c7adb06b78..b5801c311846 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1790,6 +1790,9 @@ void smp_perf_pending_interrupt(struct pt_regs *regs) void set_perf_event_pending(void) { #ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + apic->send_IPI_self(LOCAL_PENDING_VECTOR); #endif } -- cgit v1.2.3 From df6c516900d48df3581b23d37d6516a22ec4f2ca Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 Aug 2009 15:39:48 -0500 Subject: USB: ehci,dbgp,early_printk: split ehci debug driver from early_printk.c Move the dbgp early printk driver in advance of refactoring and adding new code, so the changes to this code are tracked separately from the move of the code. The drivers/usb/early directory will be the location of the current and future early usb code for driving usb devices prior initializing the standard interrupt driven USB drivers. Signed-off-by: Jason Wessel Cc: Ingo Molnar Cc: Andrew Morton Cc: Yinghai Lu Cc: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early_printk.c | 715 ---------------------------------------- drivers/usb/Makefile | 1 + drivers/usb/early/Makefile | 5 + drivers/usb/early/ehci-dbgp.c | 723 +++++++++++++++++++++++++++++++++++++++++ include/linux/usb/ehci_def.h | 6 + 5 files changed, 735 insertions(+), 715 deletions(-) create mode 100644 drivers/usb/early/Makefile create mode 100644 drivers/usb/early/ehci-dbgp.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index b11cab3c323a..519a5e10be53 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -160,721 +160,6 @@ static struct console early_serial_console = { .index = -1, }; -#ifdef CONFIG_EARLY_PRINTK_DBGP - -static struct ehci_caps __iomem *ehci_caps; -static struct ehci_regs __iomem *ehci_regs; -static struct ehci_dbg_port __iomem *ehci_debug; -static unsigned int dbgp_endpoint_out; - -struct ehci_dev { - u32 bus; - u32 slot; - u32 func; -}; - -static struct ehci_dev ehci_dev; - -#define USB_DEBUG_DEVNUM 127 - -#define DBGP_DATA_TOGGLE 0x8800 - -static inline u32 dbgp_pid_update(u32 x, u32 tok) -{ - return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); -} - -static inline u32 dbgp_len_update(u32 x, u32 len) -{ - return (x & ~0x0f) | (len & 0x0f); -} - -/* - * USB Packet IDs (PIDs) - */ - -/* token */ -#define USB_PID_OUT 0xe1 -#define USB_PID_IN 0x69 -#define USB_PID_SOF 0xa5 -#define USB_PID_SETUP 0x2d -/* handshake */ -#define USB_PID_ACK 0xd2 -#define USB_PID_NAK 0x5a -#define USB_PID_STALL 0x1e -#define USB_PID_NYET 0x96 -/* data */ -#define USB_PID_DATA0 0xc3 -#define USB_PID_DATA1 0x4b -#define USB_PID_DATA2 0x87 -#define USB_PID_MDATA 0x0f -/* Special */ -#define USB_PID_PREAMBLE 0x3c -#define USB_PID_ERR 0x3c -#define USB_PID_SPLIT 0x78 -#define USB_PID_PING 0xb4 -#define USB_PID_UNDEF_0 0xf0 - -#define USB_PID_DATA_TOGGLE 0x88 -#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) - -#define PCI_CAP_ID_EHCI_DEBUG 0xa - -#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ -#define HUB_SHORT_RESET_TIME 10 -#define HUB_LONG_RESET_TIME 200 -#define HUB_RESET_TIMEOUT 500 - -#define DBGP_MAX_PACKET 8 - -static int dbgp_wait_until_complete(void) -{ - u32 ctrl; - int loop = 0x100000; - - do { - ctrl = readl(&ehci_debug->control); - /* Stop when the transaction is finished */ - if (ctrl & DBGP_DONE) - break; - } while (--loop > 0); - - if (!loop) - return -1; - - /* - * Now that we have observed the completed transaction, - * clear the done bit. - */ - writel(ctrl | DBGP_DONE, &ehci_debug->control); - return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); -} - -static void __init dbgp_mdelay(int ms) -{ - int i; - - while (ms--) { - for (i = 0; i < 1000; i++) - outb(0x1, 0x80); - } -} - -static void dbgp_breath(void) -{ - /* Sleep to give the debug port a chance to breathe */ -} - -static int dbgp_wait_until_done(unsigned ctrl) -{ - u32 pids, lpid; - int ret; - int loop = 3; - -retry: - writel(ctrl | DBGP_GO, &ehci_debug->control); - ret = dbgp_wait_until_complete(); - pids = readl(&ehci_debug->pids); - lpid = DBGP_PID_GET(pids); - - if (ret < 0) - return ret; - - /* - * If the port is getting full or it has dropped data - * start pacing ourselves, not necessary but it's friendly. - */ - if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) - dbgp_breath(); - - /* If I get a NACK reissue the transmission */ - if (lpid == USB_PID_NAK) { - if (--loop > 0) - goto retry; - } - - return ret; -} - -static void dbgp_set_data(const void *buf, int size) -{ - const unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = hi = 0; - for (i = 0; i < 4 && i < size; i++) - lo |= bytes[i] << (8*i); - for (; i < 8 && i < size; i++) - hi |= bytes[i] << (8*(i - 4)); - writel(lo, &ehci_debug->data03); - writel(hi, &ehci_debug->data47); -} - -static void __init dbgp_get_data(void *buf, int size) -{ - unsigned char *bytes = buf; - u32 lo, hi; - int i; - - lo = readl(&ehci_debug->data03); - hi = readl(&ehci_debug->data47); - for (i = 0; i < 4 && i < size; i++) - bytes[i] = (lo >> (8*i)) & 0xff; - for (; i < 8 && i < size; i++) - bytes[i] = (hi >> (8*(i - 4))) & 0xff; -} - -static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, - const char *bytes, int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_OUT); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - dbgp_set_data(bytes, size); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - return ret; -} - -static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, - int size) -{ - u32 pids, addr, ctrl; - int ret; - - if (size > DBGP_MAX_PACKET) - return -1; - - addr = DBGP_EPADDR(devnum, endpoint); - - pids = readl(&ehci_debug->pids); - pids = dbgp_pid_update(pids, USB_PID_IN); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, size); - ctrl &= ~DBGP_OUT; - ctrl |= DBGP_GO; - - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - if (size > ret) - size = ret; - dbgp_get_data(data, size); - return ret; -} - -static int __init dbgp_control_msg(unsigned devnum, int requesttype, - int request, int value, int index, void *data, int size) -{ - u32 pids, addr, ctrl; - struct usb_ctrlrequest req; - int read; - int ret; - - read = (requesttype & USB_DIR_IN) != 0; - if (size > (read ? DBGP_MAX_PACKET:0)) - return -1; - - /* Compute the control message */ - req.bRequestType = requesttype; - req.bRequest = request; - req.wValue = cpu_to_le16(value); - req.wIndex = cpu_to_le16(index); - req.wLength = cpu_to_le16(size); - - pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); - addr = DBGP_EPADDR(devnum, 0); - - ctrl = readl(&ehci_debug->control); - ctrl = dbgp_len_update(ctrl, sizeof(req)); - ctrl |= DBGP_OUT; - ctrl |= DBGP_GO; - - /* Send the setup message */ - dbgp_set_data(&req, sizeof(req)); - writel(addr, &ehci_debug->address); - writel(pids, &ehci_debug->pids); - ret = dbgp_wait_until_done(ctrl); - if (ret < 0) - return ret; - - /* Read the result */ - return dbgp_bulk_read(devnum, 0, data, size); -} - - -/* Find a PCI capability */ -static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) -{ - u8 pos; - int bytes; - - if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & - PCI_STATUS_CAP_LIST)) - return 0; - - pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); - for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { - u8 id; - - pos &= ~3; - id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); - if (id == 0xff) - break; - if (id == cap) - return pos; - - pos = read_pci_config_byte(num, slot, func, - pos+PCI_CAP_LIST_NEXT); - } - return 0; -} - -static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) -{ - u32 class; - - class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); - if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) - return 0; - - return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); -} - -static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) -{ - u32 bus, slot, func; - - for (bus = 0; bus < 256; bus++) { - for (slot = 0; slot < 32; slot++) { - for (func = 0; func < 8; func++) { - unsigned cap; - - cap = __find_dbgp(bus, slot, func); - - if (!cap) - continue; - if (ehci_num-- != 0) - continue; - *rbus = bus; - *rslot = slot; - *rfunc = func; - return cap; - } - } - } - return 0; -} - -static int __init ehci_reset_port(int port) -{ - u32 portsc; - u32 delay_time, delay; - int loop; - - /* Reset the usb debug port */ - portsc = readl(&ehci_regs->port_status[port - 1]); - portsc &= ~PORT_PE; - portsc |= PORT_RESET; - writel(portsc, &ehci_regs->port_status[port - 1]); - - delay = HUB_ROOT_RESET_TIME; - for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; - delay_time += delay) { - dbgp_mdelay(delay); - - portsc = readl(&ehci_regs->port_status[port - 1]); - if (portsc & PORT_RESET) { - /* force reset to complete */ - loop = 2; - writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), - &ehci_regs->port_status[port - 1]); - do { - portsc = readl(&ehci_regs->port_status[port-1]); - } while ((portsc & PORT_RESET) && (--loop > 0)); - } - - /* Device went away? */ - if (!(portsc & PORT_CONNECT)) - return -ENOTCONN; - - /* bomb out completely if something weird happend */ - if ((portsc & PORT_CSC)) - return -EINVAL; - - /* If we've finished resetting, then break out of the loop */ - if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) - return 0; - } - return -EBUSY; -} - -static int __init ehci_wait_for_port(int port) -{ - u32 status; - int ret, reps; - - for (reps = 0; reps < 3; reps++) { - dbgp_mdelay(100); - status = readl(&ehci_regs->status); - if (status & STS_PCD) { - ret = ehci_reset_port(port); - if (ret == 0) - return 0; - } - } - return -ENOTCONN; -} - -#ifdef DBGP_DEBUG -# define dbgp_printk early_printk -#else -static inline void dbgp_printk(const char *fmt, ...) { } -#endif - -typedef void (*set_debug_port_t)(int port); - -static void __init default_set_debug_port(int port) -{ -} - -static set_debug_port_t __initdata set_debug_port = default_set_debug_port; - -static void __init nvidia_set_debug_port(int port) -{ - u32 dword; - dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x74); - dword &= ~(0x0f<<12); - dword |= ((port & 0x0f)<<12); - write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, - dword); - dbgp_printk("set debug port to %d\n", port); -} - -static void __init detect_set_debug_port(void) -{ - u32 vendorid; - - vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, - 0x00); - - if ((vendorid & 0xffff) == 0x10de) { - dbgp_printk("using nvidia set_debug_port\n"); - set_debug_port = nvidia_set_debug_port; - } -} - -static int __init ehci_setup(void) -{ - struct usb_debug_descriptor dbgp_desc; - u32 cmd, ctrl, status, portsc, hcs_params; - u32 debug_port, new_debug_port = 0, n_ports; - u32 devnum; - int ret, i; - int loop; - int port_map_tried; - int playtimes = 3; - -try_next_time: - port_map_tried = 0; - -try_next_port: - - hcs_params = readl(&ehci_caps->hcs_params); - debug_port = HCS_DEBUG_PORT(hcs_params); - n_ports = HCS_N_PORTS(hcs_params); - - dbgp_printk("debug_port: %d\n", debug_port); - dbgp_printk("n_ports: %d\n", n_ports); - - for (i = 1; i <= n_ports; i++) { - portsc = readl(&ehci_regs->port_status[i-1]); - dbgp_printk("portstatus%d: %08x\n", i, portsc); - } - - if (port_map_tried && (new_debug_port != debug_port)) { - if (--playtimes) { - set_debug_port(new_debug_port); - goto try_next_time; - } - return -1; - } - - loop = 100000; - /* Reset the EHCI controller */ - cmd = readl(&ehci_regs->command); - cmd |= CMD_RESET; - writel(cmd, &ehci_regs->command); - do { - cmd = readl(&ehci_regs->command); - } while ((cmd & CMD_RESET) && (--loop > 0)); - - if (!loop) { - dbgp_printk("can not reset ehci\n"); - return -1; - } - dbgp_printk("ehci reset done\n"); - - /* Claim ownership, but do not enable yet */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_OWNER; - ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); - writel(ctrl, &ehci_debug->control); - - /* Start the ehci running */ - cmd = readl(&ehci_regs->command); - cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); - cmd |= CMD_RUN; - writel(cmd, &ehci_regs->command); - - /* Ensure everything is routed to the EHCI */ - writel(FLAG_CF, &ehci_regs->configured_flag); - - /* Wait until the controller is no longer halted */ - loop = 10; - do { - status = readl(&ehci_regs->status); - } while ((status & STS_HALT) && (--loop > 0)); - - if (!loop) { - dbgp_printk("ehci can be started\n"); - return -1; - } - dbgp_printk("ehci started\n"); - - /* Wait for a device to show up in the debug port */ - ret = ehci_wait_for_port(debug_port); - if (ret < 0) { - dbgp_printk("No device found in debug port\n"); - goto next_debug_port; - } - dbgp_printk("ehci wait for port done\n"); - - /* Enable the debug port */ - ctrl = readl(&ehci_debug->control); - ctrl |= DBGP_CLAIM; - writel(ctrl, &ehci_debug->control); - ctrl = readl(&ehci_debug->control); - if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { - dbgp_printk("No device in debug port\n"); - writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); - goto err; - } - dbgp_printk("debug ported enabled\n"); - - /* Completely transfer the debug device to the debug controller */ - portsc = readl(&ehci_regs->port_status[debug_port - 1]); - portsc &= ~PORT_PE; - writel(portsc, &ehci_regs->port_status[debug_port - 1]); - - dbgp_mdelay(100); - - /* Find the debug device and make it device number 127 */ - for (devnum = 0; devnum <= 127; devnum++) { - ret = dbgp_control_msg(devnum, - USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, - &dbgp_desc, sizeof(dbgp_desc)); - if (ret > 0) - break; - } - if (devnum > 127) { - dbgp_printk("Could not find attached debug device\n"); - goto err; - } - if (ret < 0) { - dbgp_printk("Attached device is not a debug device\n"); - goto err; - } - dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; - - /* Move the device to 127 if it isn't already there */ - if (devnum != USB_DEBUG_DEVNUM) { - ret = dbgp_control_msg(devnum, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); - if (ret < 0) { - dbgp_printk("Could not move attached device to %d\n", - USB_DEBUG_DEVNUM); - goto err; - } - devnum = USB_DEBUG_DEVNUM; - dbgp_printk("debug device renamed to 127\n"); - } - - /* Enable the debug interface */ - ret = dbgp_control_msg(USB_DEBUG_DEVNUM, - USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, - USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); - if (ret < 0) { - dbgp_printk(" Could not enable the debug device\n"); - goto err; - } - dbgp_printk("debug interface enabled\n"); - - /* Perform a small write to get the even/odd data state in sync - */ - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); - if (ret < 0) { - dbgp_printk("dbgp_bulk_write failed: %d\n", ret); - goto err; - } - dbgp_printk("small write doned\n"); - - return 0; -err: - /* Things didn't work so remove my claim */ - ctrl = readl(&ehci_debug->control); - ctrl &= ~(DBGP_CLAIM | DBGP_OUT); - writel(ctrl, &ehci_debug->control); - return -1; - -next_debug_port: - port_map_tried |= (1<<(debug_port - 1)); - new_debug_port = ((debug_port-1+1)%n_ports) + 1; - if (port_map_tried != ((1<> 29) & 0x7; - bar = (bar * 4) + 0xc; - offset = (debug_port >> 16) & 0xfff; - dbgp_printk("bar: %02x offset: %03x\n", bar, offset); - if (bar != PCI_BASE_ADDRESS_0) { - dbgp_printk("only debug ports on bar 1 handled.\n"); - - return -1; - } - - bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); - dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); - if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { - dbgp_printk("only simple 32bit mmio bars supported\n"); - - return -1; - } - - /* double check if the mem space is enabled */ - byte = read_pci_config_byte(bus, slot, func, 0x04); - if (!(byte & 0x2)) { - byte |= 0x02; - write_pci_config_byte(bus, slot, func, 0x04, byte); - dbgp_printk("mmio for ehci enabled\n"); - } - - /* - * FIXME I don't have the bar size so just guess PAGE_SIZE is more - * than enough. 1K is the biggest I have seen. - */ - set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); - ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); - ehci_bar += bar_val & ~PAGE_MASK; - dbgp_printk("ehci_bar: %p\n", ehci_bar); - - ehci_caps = ehci_bar; - ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); - ehci_debug = ehci_bar + offset; - ehci_dev.bus = bus; - ehci_dev.slot = slot; - ehci_dev.func = func; - - detect_set_debug_port(); - - ret = ehci_setup(); - if (ret < 0) { - dbgp_printk("ehci_setup failed\n"); - ehci_debug = NULL; - - return -1; - } - - return 0; -} - -static void early_dbgp_write(struct console *con, const char *str, u32 n) -{ - int chunk, ret; - - if (!ehci_debug) - return; - while (n > 0) { - chunk = n; - if (chunk > DBGP_MAX_PACKET) - chunk = DBGP_MAX_PACKET; - ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, - dbgp_endpoint_out, str, chunk); - str += chunk; - n -= chunk; - } -} - -static struct console early_dbgp_console = { - .name = "earlydbg", - .write = early_dbgp_write, - .flags = CON_PRINTBUFFER, - .index = -1, -}; -#endif - /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; static int __initdata early_console_initialized; diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index dbe4fb694ad2..be3c9b80bc9f 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -40,6 +40,7 @@ obj-$(CONFIG_USB_MICROTEK) += image/ obj-$(CONFIG_USB_SERIAL) += serial/ obj-$(CONFIG_USB) += misc/ +obj-y += early/ obj-$(CONFIG_USB_ATM) += atm/ obj-$(CONFIG_USB_SPEEDTOUCH) += atm/ diff --git a/drivers/usb/early/Makefile b/drivers/usb/early/Makefile new file mode 100644 index 000000000000..dfedee8c45b6 --- /dev/null +++ b/drivers/usb/early/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for early USB devices +# + +obj-$(CONFIG_EARLY_PRINTK_DBGP) += ehci-dbgp.o diff --git a/drivers/usb/early/ehci-dbgp.c b/drivers/usb/early/ehci-dbgp.c new file mode 100644 index 000000000000..821b7b21c29c --- /dev/null +++ b/drivers/usb/early/ehci-dbgp.c @@ -0,0 +1,723 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct ehci_caps __iomem *ehci_caps; +static struct ehci_regs __iomem *ehci_regs; +static struct ehci_dbg_port __iomem *ehci_debug; +static unsigned int dbgp_endpoint_out; + +struct ehci_dev { + u32 bus; + u32 slot; + u32 func; +}; + +static struct ehci_dev ehci_dev; + +#define USB_DEBUG_DEVNUM 127 + +#define DBGP_DATA_TOGGLE 0x8800 + +static inline u32 dbgp_pid_update(u32 x, u32 tok) +{ + return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); +} + +static inline u32 dbgp_len_update(u32 x, u32 len) +{ + return (x & ~0x0f) | (len & 0x0f); +} + +/* + * USB Packet IDs (PIDs) + */ + +/* token */ +#define USB_PID_OUT 0xe1 +#define USB_PID_IN 0x69 +#define USB_PID_SOF 0xa5 +#define USB_PID_SETUP 0x2d +/* handshake */ +#define USB_PID_ACK 0xd2 +#define USB_PID_NAK 0x5a +#define USB_PID_STALL 0x1e +#define USB_PID_NYET 0x96 +/* data */ +#define USB_PID_DATA0 0xc3 +#define USB_PID_DATA1 0x4b +#define USB_PID_DATA2 0x87 +#define USB_PID_MDATA 0x0f +/* Special */ +#define USB_PID_PREAMBLE 0x3c +#define USB_PID_ERR 0x3c +#define USB_PID_SPLIT 0x78 +#define USB_PID_PING 0xb4 +#define USB_PID_UNDEF_0 0xf0 + +#define USB_PID_DATA_TOGGLE 0x88 +#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) + +#define PCI_CAP_ID_EHCI_DEBUG 0xa + +#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ +#define HUB_SHORT_RESET_TIME 10 +#define HUB_LONG_RESET_TIME 200 +#define HUB_RESET_TIMEOUT 500 + +#define DBGP_MAX_PACKET 8 + +static int dbgp_wait_until_complete(void) +{ + u32 ctrl; + int loop = 0x100000; + + do { + ctrl = readl(&ehci_debug->control); + /* Stop when the transaction is finished */ + if (ctrl & DBGP_DONE) + break; + } while (--loop > 0); + + if (!loop) + return -1; + + /* + * Now that we have observed the completed transaction, + * clear the done bit. + */ + writel(ctrl | DBGP_DONE, &ehci_debug->control); + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); +} + +static void __init dbgp_mdelay(int ms) +{ + int i; + + while (ms--) { + for (i = 0; i < 1000; i++) + outb(0x1, 0x80); + } +} + +static void dbgp_breath(void) +{ + /* Sleep to give the debug port a chance to breathe */ +} + +static int dbgp_wait_until_done(unsigned ctrl) +{ + u32 pids, lpid; + int ret; + int loop = 3; + +retry: + writel(ctrl | DBGP_GO, &ehci_debug->control); + ret = dbgp_wait_until_complete(); + pids = readl(&ehci_debug->pids); + lpid = DBGP_PID_GET(pids); + + if (ret < 0) + return ret; + + /* + * If the port is getting full or it has dropped data + * start pacing ourselves, not necessary but it's friendly. + */ + if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) + dbgp_breath(); + + /* If I get a NACK reissue the transmission */ + if (lpid == USB_PID_NAK) { + if (--loop > 0) + goto retry; + } + + return ret; +} + +static void dbgp_set_data(const void *buf, int size) +{ + const unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = hi = 0; + for (i = 0; i < 4 && i < size; i++) + lo |= bytes[i] << (8*i); + for (; i < 8 && i < size; i++) + hi |= bytes[i] << (8*(i - 4)); + writel(lo, &ehci_debug->data03); + writel(hi, &ehci_debug->data47); +} + +static void __init dbgp_get_data(void *buf, int size) +{ + unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = readl(&ehci_debug->data03); + hi = readl(&ehci_debug->data47); + for (i = 0; i < 4 && i < size; i++) + bytes[i] = (lo >> (8*i)) & 0xff; + for (; i < 8 && i < size; i++) + bytes[i] = (hi >> (8*(i - 4))) & 0xff; +} + +static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, + const char *bytes, int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_OUT); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + dbgp_set_data(bytes, size); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + return ret; +} + +static int __init dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_IN); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl &= ~DBGP_OUT; + ctrl |= DBGP_GO; + + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + if (size > ret) + size = ret; + dbgp_get_data(data, size); + return ret; +} + +static int __init dbgp_control_msg(unsigned devnum, int requesttype, + int request, int value, int index, void *data, int size) +{ + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; + int read; + int ret; + + read = (requesttype & USB_DIR_IN) != 0; + if (size > (read ? DBGP_MAX_PACKET:0)) + return -1; + + /* Compute the control message */ + req.bRequestType = requesttype; + req.bRequest = request; + req.wValue = cpu_to_le16(value); + req.wIndex = cpu_to_le16(index); + req.wLength = cpu_to_le16(size); + + pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); + addr = DBGP_EPADDR(devnum, 0); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, sizeof(req)); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + /* Send the setup message */ + dbgp_set_data(&req, sizeof(req)); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + /* Read the result */ + return dbgp_bulk_read(devnum, 0, data, size); +} + + +/* Find a PCI capability */ +static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) +{ + u8 pos; + int bytes; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) +{ + u32 class; + + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) + return 0; + + return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); +} + +static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) +{ + u32 bus, slot, func; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + unsigned cap; + + cap = __find_dbgp(bus, slot, func); + + if (!cap) + continue; + if (ehci_num-- != 0) + continue; + *rbus = bus; + *rslot = slot; + *rfunc = func; + return cap; + } + } + } + return 0; +} + +static int __init ehci_reset_port(int port) +{ + u32 portsc; + u32 delay_time, delay; + int loop; + + /* Reset the usb debug port */ + portsc = readl(&ehci_regs->port_status[port - 1]); + portsc &= ~PORT_PE; + portsc |= PORT_RESET; + writel(portsc, &ehci_regs->port_status[port - 1]); + + delay = HUB_ROOT_RESET_TIME; + for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; + delay_time += delay) { + dbgp_mdelay(delay); + + portsc = readl(&ehci_regs->port_status[port - 1]); + if (portsc & PORT_RESET) { + /* force reset to complete */ + loop = 2; + writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), + &ehci_regs->port_status[port - 1]); + do { + portsc = readl(&ehci_regs->port_status[port-1]); + } while ((portsc & PORT_RESET) && (--loop > 0)); + } + + /* Device went away? */ + if (!(portsc & PORT_CONNECT)) + return -ENOTCONN; + + /* bomb out completely if something weird happend */ + if ((portsc & PORT_CSC)) + return -EINVAL; + + /* If we've finished resetting, then break out of the loop */ + if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) + return 0; + } + return -EBUSY; +} + +static int __init ehci_wait_for_port(int port) +{ + u32 status; + int ret, reps; + + for (reps = 0; reps < 3; reps++) { + dbgp_mdelay(100); + status = readl(&ehci_regs->status); + if (status & STS_PCD) { + ret = ehci_reset_port(port); + if (ret == 0) + return 0; + } + } + return -ENOTCONN; +} + +#ifdef DBGP_DEBUG +# define dbgp_printk early_printk +#else +static inline void dbgp_printk(const char *fmt, ...) { } +#endif + +typedef void (*set_debug_port_t)(int port); + +static void __init default_set_debug_port(int port) +{ +} + +static set_debug_port_t __initdata set_debug_port = default_set_debug_port; + +static void __init nvidia_set_debug_port(int port) +{ + u32 dword; + dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x74); + dword &= ~(0x0f<<12); + dword |= ((port & 0x0f)<<12); + write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, + dword); + dbgp_printk("set debug port to %d\n", port); +} + +static void __init detect_set_debug_port(void) +{ + u32 vendorid; + + vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x00); + + if ((vendorid & 0xffff) == 0x10de) { + dbgp_printk("using nvidia set_debug_port\n"); + set_debug_port = nvidia_set_debug_port; + } +} + +static int __init ehci_setup(void) +{ + struct usb_debug_descriptor dbgp_desc; + u32 cmd, ctrl, status, portsc, hcs_params; + u32 debug_port, new_debug_port = 0, n_ports; + u32 devnum; + int ret, i; + int loop; + int port_map_tried; + int playtimes = 3; + +try_next_time: + port_map_tried = 0; + +try_next_port: + + hcs_params = readl(&ehci_caps->hcs_params); + debug_port = HCS_DEBUG_PORT(hcs_params); + n_ports = HCS_N_PORTS(hcs_params); + + dbgp_printk("debug_port: %d\n", debug_port); + dbgp_printk("n_ports: %d\n", n_ports); + + for (i = 1; i <= n_ports; i++) { + portsc = readl(&ehci_regs->port_status[i-1]); + dbgp_printk("portstatus%d: %08x\n", i, portsc); + } + + if (port_map_tried && (new_debug_port != debug_port)) { + if (--playtimes) { + set_debug_port(new_debug_port); + goto try_next_time; + } + return -1; + } + + loop = 100000; + /* Reset the EHCI controller */ + cmd = readl(&ehci_regs->command); + cmd |= CMD_RESET; + writel(cmd, &ehci_regs->command); + do { + cmd = readl(&ehci_regs->command); + } while ((cmd & CMD_RESET) && (--loop > 0)); + + if (!loop) { + dbgp_printk("can not reset ehci\n"); + return -1; + } + dbgp_printk("ehci reset done\n"); + + /* Claim ownership, but do not enable yet */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_OWNER; + ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); + writel(ctrl, &ehci_debug->control); + + /* Start the ehci running */ + cmd = readl(&ehci_regs->command); + cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); + cmd |= CMD_RUN; + writel(cmd, &ehci_regs->command); + + /* Ensure everything is routed to the EHCI */ + writel(FLAG_CF, &ehci_regs->configured_flag); + + /* Wait until the controller is no longer halted */ + loop = 10; + do { + status = readl(&ehci_regs->status); + } while ((status & STS_HALT) && (--loop > 0)); + + if (!loop) { + dbgp_printk("ehci can be started\n"); + return -1; + } + dbgp_printk("ehci started\n"); + + /* Wait for a device to show up in the debug port */ + ret = ehci_wait_for_port(debug_port); + if (ret < 0) { + dbgp_printk("No device found in debug port\n"); + goto next_debug_port; + } + dbgp_printk("ehci wait for port done\n"); + + /* Enable the debug port */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_CLAIM; + writel(ctrl, &ehci_debug->control); + ctrl = readl(&ehci_debug->control); + if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { + dbgp_printk("No device in debug port\n"); + writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); + goto err; + } + dbgp_printk("debug ported enabled\n"); + + /* Completely transfer the debug device to the debug controller */ + portsc = readl(&ehci_regs->port_status[debug_port - 1]); + portsc &= ~PORT_PE; + writel(portsc, &ehci_regs->port_status[debug_port - 1]); + + dbgp_mdelay(100); + + /* Find the debug device and make it device number 127 */ + for (devnum = 0; devnum <= 127; devnum++) { + ret = dbgp_control_msg(devnum, + USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, + &dbgp_desc, sizeof(dbgp_desc)); + if (ret > 0) + break; + } + if (devnum > 127) { + dbgp_printk("Could not find attached debug device\n"); + goto err; + } + if (ret < 0) { + dbgp_printk("Attached device is not a debug device\n"); + goto err; + } + dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; + + /* Move the device to 127 if it isn't already there */ + if (devnum != USB_DEBUG_DEVNUM) { + ret = dbgp_control_msg(devnum, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); + if (ret < 0) { + dbgp_printk("Could not move attached device to %d\n", + USB_DEBUG_DEVNUM); + goto err; + } + devnum = USB_DEBUG_DEVNUM; + dbgp_printk("debug device renamed to 127\n"); + } + + /* Enable the debug interface */ + ret = dbgp_control_msg(USB_DEBUG_DEVNUM, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); + if (ret < 0) { + dbgp_printk(" Could not enable the debug device\n"); + goto err; + } + dbgp_printk("debug interface enabled\n"); + + /* Perform a small write to get the even/odd data state in sync + */ + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); + if (ret < 0) { + dbgp_printk("dbgp_bulk_write failed: %d\n", ret); + goto err; + } + dbgp_printk("small write doned\n"); + + return 0; +err: + /* Things didn't work so remove my claim */ + ctrl = readl(&ehci_debug->control); + ctrl &= ~(DBGP_CLAIM | DBGP_OUT); + writel(ctrl, &ehci_debug->control); + return -1; + +next_debug_port: + port_map_tried |= (1<<(debug_port - 1)); + new_debug_port = ((debug_port-1+1)%n_ports) + 1; + if (port_map_tried != ((1<> 29) & 0x7; + bar = (bar * 4) + 0xc; + offset = (debug_port >> 16) & 0xfff; + dbgp_printk("bar: %02x offset: %03x\n", bar, offset); + if (bar != PCI_BASE_ADDRESS_0) { + dbgp_printk("only debug ports on bar 1 handled.\n"); + + return -1; + } + + bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); + if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { + dbgp_printk("only simple 32bit mmio bars supported\n"); + + return -1; + } + + /* double check if the mem space is enabled */ + byte = read_pci_config_byte(bus, slot, func, 0x04); + if (!(byte & 0x2)) { + byte |= 0x02; + write_pci_config_byte(bus, slot, func, 0x04, byte); + dbgp_printk("mmio for ehci enabled\n"); + } + + /* + * FIXME I don't have the bar size so just guess PAGE_SIZE is more + * than enough. 1K is the biggest I have seen. + */ + set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); + ehci_bar += bar_val & ~PAGE_MASK; + dbgp_printk("ehci_bar: %p\n", ehci_bar); + + ehci_caps = ehci_bar; + ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); + ehci_debug = ehci_bar + offset; + ehci_dev.bus = bus; + ehci_dev.slot = slot; + ehci_dev.func = func; + + detect_set_debug_port(); + + ret = ehci_setup(); + if (ret < 0) { + dbgp_printk("ehci_setup failed\n"); + ehci_debug = NULL; + + return -1; + } + + return 0; +} + +static void early_dbgp_write(struct console *con, const char *str, u32 n) +{ + int chunk, ret; + + if (!ehci_debug) + return; + while (n > 0) { + chunk = n; + if (chunk > DBGP_MAX_PACKET) + chunk = DBGP_MAX_PACKET; + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, + dbgp_endpoint_out, str, chunk); + str += chunk; + n -= chunk; + } +} + +struct console early_dbgp_console = { + .name = "earlydbg", + .write = early_dbgp_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + diff --git a/include/linux/usb/ehci_def.h b/include/linux/usb/ehci_def.h index 4c4b701ff265..1deac2a7b12e 100644 --- a/include/linux/usb/ehci_def.h +++ b/include/linux/usb/ehci_def.h @@ -170,4 +170,10 @@ struct ehci_dbg_port { #define DBGP_EPADDR(dev, ep) (((dev)<<8)|(ep)) } __attribute__ ((packed)); +#ifdef CONFIG_EARLY_PRINTK_DBGP +#include +extern int __init early_dbgp_init(char *s); +extern struct console early_dbgp_console; +#endif /* CONFIG_EARLY_PRINTK_DBGP */ + #endif /* __LINUX_USB_EHCI_DEF_H */ -- cgit v1.2.3 From c9530948bc626c8b638015c0b32abb9615659ec6 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 20 Aug 2009 15:39:52 -0500 Subject: early_printk: Allow more than one early console It is desirable to be able to use one early boot device to debug another or to have multiple places you can see the early boot diagnostics, such as the vga screen or serial device. This patch changes the early_printk console device registration to allow more than one early printk device to get registered via register_console(). Signed-off-by: Jason Wessel Cc: Ingo Molnar Cc: Andrew Morton Cc: Yinghai Lu Cc: "Eric W. Biederman" Cc: Randy Dunlap Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/early_printk.c | 65 +++++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 32 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 519a5e10be53..2acfd3fdc0cc 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -176,10 +176,19 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } +static inline void early_console_register(struct console *con, int keep_early) +{ + early_console = con; + if (keep_early) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} static int __init setup_early_printk(char *buf) { - int keep_early; + int keep; if (!buf) return 0; @@ -188,42 +197,34 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - keep_early = (strstr(buf, "keep") != NULL); - - if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); - early_console = &early_serial_console; - } else if (!strncmp(buf, "ttyS", 4)) { - early_serial_init(buf); - early_console = &early_serial_console; - } else if (!strncmp(buf, "vga", 3) - && boot_params.screen_info.orig_video_isVGA == 1) { - max_xpos = boot_params.screen_info.orig_video_cols; - max_ypos = boot_params.screen_info.orig_video_lines; - current_ypos = boot_params.screen_info.orig_y; - early_console = &early_vga_console; + keep = (strstr(buf, "keep") != NULL); + + while (*buf != '\0') { + if (!strncmp(buf, "serial", 6)) { + early_serial_init(buf + 6); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "ttyS", 4)) { + early_serial_init(buf + 4); + early_console_register(&early_serial_console, keep); + } + if (!strncmp(buf, "vga", 3) && + boot_params.screen_info.orig_video_isVGA == 1) { + max_xpos = boot_params.screen_info.orig_video_cols; + max_ypos = boot_params.screen_info.orig_video_lines; + current_ypos = boot_params.screen_info.orig_y; + early_console_register(&early_vga_console, keep); + } #ifdef CONFIG_EARLY_PRINTK_DBGP - } else if (!strncmp(buf, "dbgp", 4)) { - if (early_dbgp_init(buf+4) < 0) - return 0; - early_console = &early_dbgp_console; - /* - * usb subsys will reset ehci controller, so don't keep - * that early console - */ - keep_early = 0; + if (!strncmp(buf, "dbgp", 4) && !early_dbgp_init(buf + 4)) + early_console_register(&early_dbgp_console, keep); #endif #ifdef CONFIG_HVC_XEN - } else if (!strncmp(buf, "xen", 3)) { - early_console = &xenboot_console; + if (!strncmp(buf, "xen", 3)) + early_console_register(&xenboot_console, keep); #endif + buf++; } - - if (keep_early) - early_console->flags &= ~CON_BOOT; - else - early_console->flags |= CON_BOOT; - register_console(early_console); return 0; } -- cgit v1.2.3 From 11868a2dc4f5e4f2f652bfd259e1360193fcee62 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 23 Sep 2009 17:49:55 +0200 Subject: x86: mce: Use safer ways to access MCE registers Use rdmsrl_safe() when accessing MCE registers. While in theory we always 'know' which ones are safe to access from the capability bits, there's a lot of hardware variations and reality might differ from theory, as it did in this case: http://bugzilla.kernel.org/show_bug.cgi?id=14204 [ 0.010016] mce: CPU supports 5 MCE banks [ 0.011029] general protection fault: 0000 [#1] [ 0.011998] last sysfs file: [ 0.011998] Modules linked in: [ 0.011998] [ 0.011998] Pid: 0, comm: swapper Not tainted (2.6.31_router #1) HP Vectra [ 0.011998] EIP: 0060:[] EFLAGS: 00010246 CPU: 0 [ 0.011998] EIP is at mce_rdmsrl+0x19/0x60 [ 0.011998] EAX: 00000000 EBX: 00000001 ECX: 00000407 EDX: 08000000 [ 0.011998] ESI: 00000000 EDI: 8c000000 EBP: 00000405 ESP: c17d5eac So WARN_ONCE() instead of crashing the box. ( also fix a number of stylistic inconsistencies in the code. ) Note, we might still crash in wrmsrl() if we get that far, but we shouldnt if the registers are truly inaccessible. Reported-by: GNUtoo Cc: Hidetoshi Seto Cc: Huang Ying Cc: Andi Kleen LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2f5aab26320e..4b2af86e3e8d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -305,13 +305,25 @@ static int msr_to_offset(u32 msr) static u64 mce_rdmsrl(u32 msr) { u64 v; + if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset < 0) return 0; return *(u64 *)((char *)&__get_cpu_var(injectm) + offset); } - rdmsrl(msr, v); + + if (rdmsrl_safe(msr, &v)) { + WARN_ONCE(1, "mce: Unable to read msr %d!\n", msr); + /* + * Return zero in case the access faulted. This should + * not happen normally but can happen if the CPU does + * something weird, or if the code is buggy. + */ + v = 0; + } + return v; } @@ -319,6 +331,7 @@ static void mce_wrmsrl(u32 msr, u64 v) { if (__get_cpu_var(injectm).finished) { int offset = msr_to_offset(msr); + if (offset >= 0) *(u64 *)((char *)&__get_cpu_var(injectm) + offset) = v; return; @@ -415,7 +428,7 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) m->ip = mce_rdmsrl(rip_msr); } -#ifdef CONFIG_X86_LOCAL_APIC +#ifdef CONFIG_X86_LOCAL_APIC /* * Called after interrupts have been reenabled again * when a MCE happened during an interrupts off region @@ -1172,6 +1185,7 @@ static int mce_banks_init(void) return -ENOMEM; for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + b->ctl = -1ULL; b->init = 1; } @@ -1203,6 +1217,7 @@ static int __cpuinit mce_cap_init(void) banks = b; if (!mce_banks) { int err = mce_banks_init(); + if (err) return err; } @@ -1237,6 +1252,7 @@ static void mce_init(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (!b->init) continue; wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); @@ -1626,6 +1642,7 @@ static int mce_disable(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } @@ -1911,6 +1928,7 @@ static void mce_disable_cpu(void *h) cmci_clear(); for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), 0); } @@ -1928,6 +1946,7 @@ static void mce_reenable_cpu(void *h) cmci_reenable(); for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; + if (b->init) wrmsrl(MSR_IA32_MCx_CTL(i), b->ctl); } -- cgit v1.2.3 From 79f5599772ac2f138d7a75b8f3f06a93f09c75f7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 15 Jun 2009 14:58:26 +0800 Subject: cpumask: use zalloc_cpumask_var() where possible Remove open-coded zalloc_cpumask_var() and zalloc_cpumask_var_node(). Signed-off-by: Li Zefan Signed-off-by: Rusty Russell --- arch/x86/kernel/apic/io_apic.c | 7 ++----- arch/x86/kernel/process.c | 6 ++---- arch/x86/kernel/smpboot.c | 9 +++------ drivers/acpi/processor_perflib.c | 3 +-- drivers/acpi/processor_throttling.c | 3 +-- drivers/net/sfc/efx.c | 3 +-- drivers/oprofile/buffer_sync.c | 3 +-- kernel/trace/trace.c | 7 ++----- virt/kvm/kvm_main.c | 3 +-- 9 files changed, 14 insertions(+), 30 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 64970b9885f2..dc69f28489f5 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -227,17 +227,14 @@ static struct irq_cfg *get_one_free_irq_cfg(int node) cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); if (cfg) { - if (!alloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { + if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { kfree(cfg); cfg = NULL; - } else if (!alloc_cpumask_var_node(&cfg->old_domain, + } else if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_ATOMIC, node)) { free_cpumask_var(cfg->domain); kfree(cfg); cfg = NULL; - } else { - cpumask_clear(cfg->domain); - cpumask_clear(cfg->old_domain); } } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 847ab4160315..5284cd2b5776 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -555,10 +555,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) void __init init_c1e_mask(void) { /* If we're using c1e_idle, we need to allocate c1e_mask. */ - if (pm_idle == c1e_idle) { - alloc_cpumask_var(&c1e_mask, GFP_KERNEL); - cpumask_clear(c1e_mask); - } + if (pm_idle == c1e_idle) + zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 09c5e077dff7..565ebc65920e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1059,12 +1059,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) #endif current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { - alloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); - alloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); - alloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); - cpumask_clear(per_cpu(cpu_core_map, i)); - cpumask_clear(per_cpu(cpu_sibling_map, i)); - cpumask_clear(cpu_data(i).llc_shared_map); + zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); + zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); } set_cpu_sibling_map(0); diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c index 11088cf10319..8ba0ed0b9ddb 100644 --- a/drivers/acpi/processor_perflib.c +++ b/drivers/acpi/processor_perflib.c @@ -511,7 +511,7 @@ int acpi_processor_preregister_performance( struct acpi_processor *match_pr; struct acpi_psd_package *match_pdomain; - if (!alloc_cpumask_var(&covered_cpus, GFP_KERNEL)) + if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)) return -ENOMEM; mutex_lock(&performance_mutex); @@ -558,7 +558,6 @@ int acpi_processor_preregister_performance( * Now that we have _PSD data from all CPUs, lets setup P-state * domain info. */ - cpumask_clear(covered_cpus); for_each_possible_cpu(i) { pr = per_cpu(processors, i); if (!pr) diff --git a/drivers/acpi/processor_throttling.c b/drivers/acpi/processor_throttling.c index ce7cf3bc5101..4c6c14c1e307 100644 --- a/drivers/acpi/processor_throttling.c +++ b/drivers/acpi/processor_throttling.c @@ -77,7 +77,7 @@ static int acpi_processor_update_tsd_coord(void) struct acpi_tsd_package *pdomain, *match_pdomain; struct acpi_processor_throttling *pthrottling, *match_pthrottling; - if (!alloc_cpumask_var(&covered_cpus, GFP_KERNEL)) + if (!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)) return -ENOMEM; /* @@ -105,7 +105,6 @@ static int acpi_processor_update_tsd_coord(void) if (retval) goto err_ret; - cpumask_clear(covered_cpus); for_each_possible_cpu(i) { pr = per_cpu(processors, i); if (!pr) diff --git a/drivers/net/sfc/efx.c b/drivers/net/sfc/efx.c index 07a7e4b8f8fc..cc4b2f99989d 100644 --- a/drivers/net/sfc/efx.c +++ b/drivers/net/sfc/efx.c @@ -884,13 +884,12 @@ static int efx_wanted_rx_queues(void) int count; int cpu; - if (unlikely(!alloc_cpumask_var(&core_mask, GFP_KERNEL))) { + if (unlikely(!zalloc_cpumask_var(&core_mask, GFP_KERNEL))) { printk(KERN_WARNING "sfc: RSS disabled due to allocation failure\n"); return 1; } - cpumask_clear(core_mask); count = 0; for_each_online_cpu(cpu) { if (!cpumask_test_cpu(cpu, core_mask)) { diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 8574622e36a5..c9e2ae90f195 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -154,9 +154,8 @@ int sync_start(void) { int err; - if (!alloc_cpumask_var(&marked_cpus, GFP_KERNEL)) + if (!zalloc_cpumask_var(&marked_cpus, GFP_KERNEL)) return -ENOMEM; - cpumask_clear(marked_cpus); start_cpu_work(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6c0f6a8a22eb..411af37f4be4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1984,11 +1984,9 @@ __tracing_open(struct inode *inode, struct file *file) if (current_trace) *iter->trace = *current_trace; - if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) + if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - cpumask_clear(iter->started); - if (current_trace && current_trace->print_max) iter->tr = &max_tr; else @@ -4389,7 +4387,7 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) goto out_free_tracing_cpumask; /* To save memory, keep the ring buffer size to its minimum */ @@ -4400,7 +4398,6 @@ __init static int tracer_alloc_buffers(void) cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); - cpumask_clear(tracing_reader_cpumask); /* TODO: make the number of buffers hot pluggable with CPUS */ global_trace.buffer = ring_buffer_alloc(ring_buf_size, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 897bff3b7df9..034a798b0431 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -738,8 +738,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req) bool called = true; struct kvm_vcpu *vcpu; - if (alloc_cpumask_var(&cpus, GFP_ATOMIC)) - cpumask_clear(cpus); + zalloc_cpumask_var(&cpus, GFP_ATOMIC); spin_lock(&kvm->requests_lock); me = smp_processor_id(); -- cgit v1.2.3 From 1d1afc1957a441fc75a27517b17437d8af3b3b93 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:36 -0600 Subject: cpumask: remove last assignment to mask field of struct irqaction. This snuck in after the patch which removed all the others. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- arch/x86/kernel/time.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index e293ac56c723..dcb00d278512 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -93,7 +93,6 @@ static struct irqaction irq0 = { void __init setup_default_timer_irq(void) { - irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } -- cgit v1.2.3 From 78f1c4d6b027993763a5aba83873b0462d06db8f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:51 -0600 Subject: cpumask: use mm_cpumask() wrapper: x86 Makes code futureproof against the impending change to mm->cpu_vm_mask (to be a pointer). It's also a chance to use the new cpumask_ ops which take a pointer (the older ones are deprecated, but there's no hurry for arch code). Signed-off-by: Rusty Russell --- arch/x86/include/asm/mmu_context.h | 6 +++--- arch/x86/kernel/ldt.c | 4 ++-- arch/x86/mm/tlb.c | 15 ++++++++------- arch/x86/xen/mmu.c | 4 ++-- 4 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index f923203dc39a..4a2d4e0c18d9 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -37,12 +37,12 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, if (likely(prev != next)) { /* stop flush ipis for the previous mm */ - cpu_clear(cpu, prev->cpu_vm_mask); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); #ifdef CONFIG_SMP percpu_write(cpu_tlbstate.state, TLBSTATE_OK); percpu_write(cpu_tlbstate.active_mm, next); #endif - cpu_set(cpu, next->cpu_vm_mask); + cpumask_set_cpu(cpu, mm_cpumask(next)); /* Re-load page tables */ load_cr3(next->pgd); @@ -58,7 +58,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, percpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(percpu_read(cpu_tlbstate.active_mm) != next); - if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) { + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next))) { /* We were in lazy tlb mode and leave_mm disabled * tlb flush IPI delivery. We must reload CR3 * to make sure to use no freed page tables. diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 71f1d99a635d..ec6ef60cbd17 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -67,8 +67,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #ifdef CONFIG_SMP preempt_disable(); load_LDT(pc); - if (!cpus_equal(current->mm->cpu_vm_mask, - cpumask_of_cpu(smp_processor_id()))) + if (!cpumask_equal(mm_cpumask(current->mm), + cpumask_of(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index c814e144a3f0..36fe08eeb5c3 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -59,7 +59,8 @@ void leave_mm(int cpu) { if (percpu_read(cpu_tlbstate.state) == TLBSTATE_OK) BUG(); - cpu_clear(cpu, percpu_read(cpu_tlbstate.active_mm)->cpu_vm_mask); + cpumask_clear_cpu(cpu, + mm_cpumask(percpu_read(cpu_tlbstate.active_mm))); load_cr3(swapper_pg_dir); } EXPORT_SYMBOL_GPL(leave_mm); @@ -234,8 +235,8 @@ void flush_tlb_current_task(void) preempt_disable(); local_flush_tlb(); - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -249,8 +250,8 @@ void flush_tlb_mm(struct mm_struct *mm) else leave_mm(smp_processor_id()); } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, TLB_FLUSH_ALL); preempt_enable(); } @@ -268,8 +269,8 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long va) leave_mm(smp_processor_id()); } - if (cpumask_any_but(&mm->cpu_vm_mask, smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(&mm->cpu_vm_mask, mm, va); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, va); preempt_enable(); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 093dd59b5385..3bf7b1d250ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1165,14 +1165,14 @@ static void xen_drop_mm_ref(struct mm_struct *mm) /* Get the "official" set of cpus referring to our pagetable. */ if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { for_each_online_cpu(cpu) { - if (!cpumask_test_cpu(cpu, &mm->cpu_vm_mask) + if (!cpumask_test_cpu(cpu, mm_cpumask(mm)) && per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) continue; smp_call_function_single(cpu, drop_other_mm_ref, mm, 1); } return; } - cpumask_copy(mask, &mm->cpu_vm_mask); + cpumask_copy(mask, mm_cpumask(mm)); /* It's possible that a vcpu may have a stale reference to our cr3, because its in lazy mode, and it hasn't yet flushed -- cgit v1.2.3 From 2bcd57ab61e7cabed626226a3771617981c11ce1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 24 Sep 2009 04:22:25 +0400 Subject: headers: utsname.h redux * remove asm/atomic.h inclusion from linux/utsname.h -- not needed after kref conversion * remove linux/utsname.h inclusion from files which do not need it NOTE: it looks like fs/binfmt_elf.c do not need utsname.h, however due to some personality stuff it _is_ needed -- cowardly leave ELF-related headers and files alone. Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 1 - arch/arm/kernel/sys_arm.c | 1 - arch/frv/kernel/sys_frv.c | 1 - arch/h8300/kernel/sys_h8300.c | 1 - arch/m68k/kernel/sys_m68k.c | 1 - arch/m68knommu/kernel/sys_m68k.c | 1 - arch/microblaze/kernel/sys_microblaze.c | 1 - arch/mn10300/kernel/sys_mn10300.c | 1 - arch/parisc/kernel/sys_parisc32.c | 1 - arch/powerpc/kernel/setup-common.c | 1 - arch/powerpc/kernel/sys_ppc32.c | 1 - arch/s390/kernel/compat_linux.c | 1 - arch/s390/kernel/process.c | 1 - arch/sh/kernel/sys_sh32.c | 1 - arch/sh/kernel/sys_sh64.c | 1 - arch/sparc/kernel/sys_sparc32.c | 1 - arch/sparc/kernel/systbls.h | 3 ++- arch/x86/kernel/dumpstack_32.c | 1 - arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/traps.c | 1 - drivers/media/video/usbvision/usbvision-core.c | 1 - drivers/media/video/usbvision/usbvision-i2c.c | 1 - drivers/media/video/usbvision/usbvision-video.c | 1 - drivers/s390/char/zcore.c | 1 - drivers/usb/gadget/f_loopback.c | 1 - drivers/usb/gadget/f_obex.c | 1 - drivers/usb/gadget/f_sourcesink.c | 1 - drivers/usb/gadget/u_audio.c | 1 - drivers/usb/gadget/u_ether.c | 1 - fs/gfs2/ops_inode.c | 1 - fs/lockd/xdr.c | 1 - fs/lockd/xdr4.c | 1 - fs/nfs/nfs2xdr.c | 1 - fs/nfs/nfs3proc.c | 1 - fs/nfs/nfs3xdr.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/nfs4xdr.c | 1 - fs/nfs/proc.c | 1 - fs/nfsd/nfs4idmap.c | 1 - fs/ocfs2/dlm/dlmast.c | 1 - fs/ocfs2/dlm/dlmconvert.c | 1 - fs/ocfs2/dlm/dlmdebug.c | 1 - fs/ocfs2/dlm/dlmdomain.c | 1 - fs/ocfs2/dlm/dlmlock.c | 1 - fs/ocfs2/dlm/dlmmaster.c | 1 - fs/ocfs2/dlm/dlmrecovery.c | 1 - fs/ocfs2/dlm/dlmthread.c | 1 - fs/ocfs2/dlm/dlmunlock.c | 1 - fs/ocfs2/super.c | 1 - fs/ocfs2/symlink.c | 1 - include/linux/utsname.h | 1 - init/main.c | 1 - kernel/power/swap.c | 1 - kernel/sysctl.c | 1 - kernel/uid16.c | 1 - net/sunrpc/auth_null.c | 1 - 56 files changed, 2 insertions(+), 56 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 3a2fb7a02db4..289039bb6bb2 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index b3ec641b5cf8..78ecaac65206 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c index baadc97f8627..2b6b5289cdcc 100644 --- a/arch/frv/kernel/sys_frv.c +++ b/arch/frv/kernel/sys_frv.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c index 2745656dcc52..8cb5d73a0e35 100644 --- a/arch/h8300/kernel/sys_h8300.c +++ b/arch/h8300/kernel/sys_h8300.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 7f54efaf60bb..7deb402bfc75 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c index 700281638629..efdd090778a3 100644 --- a/arch/m68knommu/kernel/sys_m68k.c +++ b/arch/m68knommu/kernel/sys_m68k.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index b96f1682bb24..07cabed4b947 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index 3e52a1054327..8ca5af00334c 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 92a0acaa0d12..561388b17c91 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 1d5570a1e456..74cd1a7d0d4b 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 1cc5e9e5da96..b97c2d67f4ac 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 5519cb745106..0debcec23a39 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 59fe6ecc6ed3..5417eb57271a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index 63ba12836eae..eb68bfdd86e6 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c index 91fb8445a5a0..287235768bc5 100644 --- a/arch/sh/kernel/sys_sh64.c +++ b/arch/sh/kernel/sys_sh64.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index f5000a460c05..04e28b2671c8 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h index 15c2d752b2bc..a63c5d2d9849 100644 --- a/arch/sparc/kernel/systbls.h +++ b/arch/sparc/kernel/systbls.h @@ -3,10 +3,11 @@ #include #include -#include #include #include +struct new_utsname; + extern asmlinkage unsigned long sys_getpagesize(void); extern asmlinkage unsigned long sparc_brk(unsigned long brk); extern asmlinkage long sparc_pipe(struct pt_regs *regs); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9e..f7dd2a7c3bf4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a3276766..a071e6be177e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9346e102338d..a665c71352b8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-core.c b/drivers/media/video/usbvision/usbvision-core.c index 6ba16abeebdd..e0f91e4ab653 100644 --- a/drivers/media/video/usbvision/usbvision-core.c +++ b/drivers/media/video/usbvision/usbvision-core.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-i2c.c b/drivers/media/video/usbvision/usbvision-i2c.c index f97fd06d5948..c19f51dba2ee 100644 --- a/drivers/media/video/usbvision/usbvision-i2c.c +++ b/drivers/media/video/usbvision/usbvision-i2c.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c index 90d9b5c0e9a7..a2a50d608a3f 100644 --- a/drivers/media/video/usbvision/usbvision-video.c +++ b/drivers/media/video/usbvision/usbvision-video.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index c431198bdbc4..82daa3c1dc9c 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/f_loopback.c b/drivers/usb/gadget/f_loopback.c index eb6ddfc20857..6cb29d3df575 100644 --- a/drivers/usb/gadget/f_loopback.c +++ b/drivers/usb/gadget/f_loopback.c @@ -22,7 +22,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include "g_zero.h" diff --git a/drivers/usb/gadget/f_obex.c b/drivers/usb/gadget/f_obex.c index 46d6266f30ec..b4a3ba654ea5 100644 --- a/drivers/usb/gadget/f_obex.c +++ b/drivers/usb/gadget/f_obex.c @@ -24,7 +24,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include "u_serial.h" diff --git a/drivers/usb/gadget/f_sourcesink.c b/drivers/usb/gadget/f_sourcesink.c index bffe91d525f9..09cba273d2db 100644 --- a/drivers/usb/gadget/f_sourcesink.c +++ b/drivers/usb/gadget/f_sourcesink.c @@ -22,7 +22,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include "g_zero.h" diff --git a/drivers/usb/gadget/u_audio.c b/drivers/usb/gadget/u_audio.c index b5200d551458..8252595d619d 100644 --- a/drivers/usb/gadget/u_audio.c +++ b/drivers/usb/gadget/u_audio.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c index f8751ff863cd..2fc02bd95848 100644 --- a/drivers/usb/gadget/u_ether.c +++ b/drivers/usb/gadget/u_ether.c @@ -23,7 +23,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include #include diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index c3ac18054057..247436c10deb 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 0336f2beacde..b583ab0a4cbb 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index e1d528653192..ad9dbbc9145d 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index c862c9340f9a..5e078b222b4e 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ee6a13f05443..3f8881d1a050 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 35869a4921f1..5fe5492fbd29 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index be6544aef41f..ed7c269e2514 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -36,7 +36,6 @@ */ #include -#include #include #include #include diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cfc30d362f94..83ad47cbdd8a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 7be72d90d49d..ef583854d8d0 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index cdfa86fa1471..ba2c199592fd 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -38,7 +38,6 @@ #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 81eff8e58322..01cf8cc3d286 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 75997b4deaf3..ca96bce50e18 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c5c88124096d..ca46002ec10e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 4d9e6b288dd8..0334000676d3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 83a9f2972ac8..437698e9465f 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f8b653fcd4dd..83bcaf266b35 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 43e6e3280569..d9fa3d22e17c 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 98569e86c613..52ec020ea78b 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 756f5b0998e0..00f53b2aea76 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 24feb449a1dc..4cc3c890a2cd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 579dd1b1110f..e3421030a69f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #define MLOG_MASK_PREFIX ML_NAMEI diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 3656b300de3a..69f39974c041 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -36,7 +36,6 @@ struct new_utsname { #include #include #include -#include struct uts_namespace { struct kref kref; diff --git a/init/main.c b/init/main.c index 6107223124e4..76961db298f0 100644 --- a/init/main.c +++ b/init/main.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c86d48..b101cdc4df3f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0dfaa47d7cb6..7f4f57bea4ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501688b9..419209893d87 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index c70dd7f5258e..1db618f56ecb 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -8,7 +8,6 @@ #include #include -#include #include #ifdef RPC_DEBUG -- cgit v1.2.3 From ea01c0d7315d6e3218fd22a6947c5b09305fcf65 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Wed, 23 Sep 2009 15:33:23 -0700 Subject: x86: Reduce verbosity of "TSC is reliable" message On modern systems, the kernel prints the message Skipping synchronization checks as TSC is reliable. once for every non-boot CPU. This gets kind of ridiculous on huge systems; for example, on a 64-thread system I was lucky enough to get: $ dmesg | grep 'TSC is reliable' | wc 63 567 4221 There's no point to doing this for every CPU, since the code is just checking the boot CPU anyway, so change this to a printk_once() to make the message appears only once. Signed-off-by: Roland Dreier LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 027b5b498993..f37930954d15 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -114,7 +114,7 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - pr_info("Skipping synchronization checks as TSC is reliable.\n"); + printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); return; } -- cgit v1.2.3 From 429a6e5e2c1fbe5d805aad123efbdb5f0c14769f Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Wed, 23 Sep 2009 18:13:13 -0500 Subject: x86: early_printk: Protect against using the same device twice If you use the kernel argument: earlyprintk=serial,ttyS0,115200 This will cause a recursive hang printing the same line again and again: BIOS-e820: 000000003fff3000 - 0000000040000000 (ACPI data) BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved) BIOS-e820: 00000000fec00000 - 0000000100000000 (reserved) bootconsole [earlyser0] enabled Linux version 2.6.31-07863-gb64ada6 (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #16789 SMP Wed Sep 23 21:09:43 CEST 2009 Linux version 2.6.31-07863-gb64ada6 (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #16789 SMP Wed Sep 23 21:09:43 CEST 2009 Linux version 2.6.31-07863-gb64ada6 (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #16789 SMP Wed Sep 23 21:09:43 CEST 2009 Linux version 2.6.31-07863-gb64ada6 (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #16789 SMP Wed Sep 23 21:09:43 CEST 2009 Linux version 2.6.31-07863-gb64ada6 (mingo@sirius) (gcc version 4.3.2 20081105 (Red Hat 4.3.2-7) (GCC) ) #16789 SMP Wed Sep 23 21:09:43 CEST 2009 Instead warn the end user that they specified the device a second time, and ignore that second console. Reported-by: Ingo Molnar Signed-off-by: Jason Wessel Cc: Len Brown Cc: Greg KH Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <4ABAAB89.1080407@windriver.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 2acfd3fdc0cc..41fd965c80c6 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -178,6 +178,11 @@ asmlinkage void early_printk(const char *fmt, ...) static inline void early_console_register(struct console *con, int keep_early) { + if (early_console->index != -1) { + printk(KERN_CRIT "ERROR: earlyprintk= %s already used\n", + con->name); + return; + } early_console = con; if (keep_early) early_console->flags &= ~CON_BOOT; -- cgit v1.2.3 From 8d65af789f3e2cf4cfbdbf71a0f7a61ebcd41d38 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 23 Sep 2009 15:57:19 -0700 Subject: sysctl: remove "struct file *" argument of ->proc_handler It's unused. It isn't needed -- read or write flag is already passed and sysctl shouldn't care about the rest. It _was_ used in two places at arch/frv for some reason. Signed-off-by: Alexey Dobriyan Cc: David Howells Cc: "Eric W. Biederman" Cc: Al Viro Cc: Ralf Baechle Cc: Martin Schwidefsky Cc: Ingo Molnar Cc: "David S. Miller" Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/kernel/pm.c | 14 +++---- arch/mips/lasat/sysctl.c | 18 ++++----- arch/s390/appldata/appldata_base.c | 9 ++--- arch/s390/kernel/debug.c | 4 +- arch/s390/mm/cmm.c | 4 +- arch/x86/include/asm/nmi.h | 3 +- arch/x86/kernel/apic/nmi.c | 4 +- arch/x86/kernel/vsyscall_64.c | 10 +---- drivers/cdrom/cdrom.c | 8 ++-- drivers/char/random.c | 4 +- drivers/net/wireless/arlan-proc.c | 28 +++++++------- drivers/parport/procfs.c | 12 +++--- fs/coda/coda_int.h | 1 + fs/drop_caches.c | 4 +- fs/file_table.c | 6 +-- fs/proc/proc_sysctl.c | 2 +- fs/xfs/linux-2.6/xfs_sysctl.c | 3 +- include/linux/fs.h | 2 +- include/linux/ftrace.h | 4 +- include/linux/hugetlb.h | 6 +-- include/linux/mm.h | 2 +- include/linux/mmzone.h | 13 +++---- include/linux/sched.h | 8 ++-- include/linux/security.h | 2 +- include/linux/swap.h | 2 +- include/linux/sysctl.h | 19 +++++----- include/linux/writeback.h | 11 +++--- include/net/ip.h | 2 +- include/net/ndisc.h | 2 - ipc/ipc_sysctl.c | 16 ++++---- ipc/mq_sysctl.c | 8 ++-- kernel/hung_task.c | 4 +- kernel/sched.c | 4 +- kernel/sched_fair.c | 4 +- kernel/slow-work.c | 12 +++--- kernel/softlockup.c | 4 +- kernel/sysctl.c | 78 ++++++++++++++++---------------------- kernel/trace/ftrace.c | 4 +- kernel/trace/trace_stack.c | 4 +- kernel/utsname_sysctl.c | 4 +- mm/hugetlb.c | 12 +++--- mm/page-writeback.c | 20 +++++----- mm/page_alloc.c | 24 ++++++------ mm/vmscan.c | 4 +- net/bridge/br_netfilter.c | 4 +- net/decnet/dn_dev.c | 5 +-- net/decnet/sysctl_net_decnet.c | 2 - net/ipv4/devinet.c | 12 +++--- net/ipv4/route.c | 7 ++-- net/ipv4/sysctl_net_ipv4.c | 16 ++++---- net/ipv6/addrconf.c | 8 ++-- net/ipv6/ndisc.c | 8 ++-- net/ipv6/route.c | 4 +- net/irda/irsysctl.c | 8 ++-- net/netfilter/ipvs/ip_vs_ctl.c | 8 ++-- net/netfilter/nf_log.c | 4 +- net/phonet/sysctl.c | 4 +- net/sunrpc/sysctl.c | 4 +- net/sunrpc/xprtrdma/svc_rdma.c | 2 +- security/min_addr.c | 4 +- 60 files changed, 239 insertions(+), 270 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/frv/kernel/pm.c b/arch/frv/kernel/pm.c index be722fc1acff..0d4d3e3a4cfc 100644 --- a/arch/frv/kernel/pm.c +++ b/arch/frv/kernel/pm.c @@ -150,7 +150,7 @@ static int user_atoi(char __user *ubuf, size_t len) /* * Send us to sleep. */ -static int sysctl_pm_do_suspend(ctl_table *ctl, int write, struct file *filp, +static int sysctl_pm_do_suspend(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *fpos) { int retval, mode; @@ -198,13 +198,13 @@ static int try_set_cmode(int new_cmode) } -static int cmode_procctl(ctl_table *ctl, int write, struct file *filp, +static int cmode_procctl(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *fpos) { int new_cmode; if (!write) - return proc_dointvec(ctl, write, filp, buffer, lenp, fpos); + return proc_dointvec(ctl, write, buffer, lenp, fpos); new_cmode = user_atoi(buffer, *lenp); @@ -301,13 +301,13 @@ static int try_set_cm(int new_cm) return 0; } -static int p0_procctl(ctl_table *ctl, int write, struct file *filp, +static int p0_procctl(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *fpos) { int new_p0; if (!write) - return proc_dointvec(ctl, write, filp, buffer, lenp, fpos); + return proc_dointvec(ctl, write, buffer, lenp, fpos); new_p0 = user_atoi(buffer, *lenp); @@ -345,13 +345,13 @@ static int p0_sysctl(ctl_table *table, return 1; } -static int cm_procctl(ctl_table *ctl, int write, struct file *filp, +static int cm_procctl(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *fpos) { int new_cm; if (!write) - return proc_dointvec(ctl, write, filp, buffer, lenp, fpos); + return proc_dointvec(ctl, write, buffer, lenp, fpos); new_cm = user_atoi(buffer, *lenp); diff --git a/arch/mips/lasat/sysctl.c b/arch/mips/lasat/sysctl.c index 3f04d4c406b7..b3deed8db619 100644 --- a/arch/mips/lasat/sysctl.c +++ b/arch/mips/lasat/sysctl.c @@ -56,12 +56,12 @@ int sysctl_lasatstring(ctl_table *table, /* And the same for proc */ -int proc_dolasatstring(ctl_table *table, int write, struct file *filp, +int proc_dolasatstring(ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int r; - r = proc_dostring(table, write, filp, buffer, lenp, ppos); + r = proc_dostring(table, write, buffer, lenp, ppos); if ((!write) || r) return r; @@ -71,12 +71,12 @@ int proc_dolasatstring(ctl_table *table, int write, struct file *filp, } /* proc function to write EEPROM after changing int entry */ -int proc_dolasatint(ctl_table *table, int write, struct file *filp, +int proc_dolasatint(ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int r; - r = proc_dointvec(table, write, filp, buffer, lenp, ppos); + r = proc_dointvec(table, write, buffer, lenp, ppos); if ((!write) || r) return r; @@ -89,7 +89,7 @@ int proc_dolasatint(ctl_table *table, int write, struct file *filp, static int rtctmp; /* proc function to read/write RealTime Clock */ -int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, +int proc_dolasatrtc(ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { struct timespec ts; @@ -102,7 +102,7 @@ int proc_dolasatrtc(ctl_table *table, int write, struct file *filp, if (rtctmp < 0) rtctmp = 0; } - r = proc_dointvec(table, write, filp, buffer, lenp, ppos); + r = proc_dointvec(table, write, buffer, lenp, ppos); if (r) return r; @@ -154,7 +154,7 @@ int sysctl_lasat_rtc(ctl_table *table, #endif #ifdef CONFIG_INET -int proc_lasat_ip(ctl_table *table, int write, struct file *filp, +int proc_lasat_ip(ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { unsigned int ip; @@ -231,12 +231,12 @@ static int sysctl_lasat_prid(ctl_table *table, return 0; } -int proc_lasat_prid(ctl_table *table, int write, struct file *filp, +int proc_lasat_prid(ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int r; - r = proc_dointvec(table, write, filp, buffer, lenp, ppos); + r = proc_dointvec(table, write, buffer, lenp, ppos); if (r < 0) return r; if (write) { diff --git a/arch/s390/appldata/appldata_base.c b/arch/s390/appldata/appldata_base.c index 264528e4f58d..b55fd7ed1c31 100644 --- a/arch/s390/appldata/appldata_base.c +++ b/arch/s390/appldata/appldata_base.c @@ -50,10 +50,9 @@ static struct platform_device *appldata_pdev; * /proc entries (sysctl) */ static const char appldata_proc_name[APPLDATA_PROC_NAME_LENGTH] = "appldata"; -static int appldata_timer_handler(ctl_table *ctl, int write, struct file *filp, +static int appldata_timer_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); static int appldata_interval_handler(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -247,7 +246,7 @@ __appldata_vtimer_setup(int cmd) * Start/Stop timer, show status of timer (0 = not active, 1 = active) */ static int -appldata_timer_handler(ctl_table *ctl, int write, struct file *filp, +appldata_timer_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int len; @@ -289,7 +288,7 @@ out: * current timer interval. */ static int -appldata_interval_handler(ctl_table *ctl, int write, struct file *filp, +appldata_interval_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int len, interval; @@ -335,7 +334,7 @@ out: * monitoring (0 = not in process, 1 = in process) */ static int -appldata_generic_handler(ctl_table *ctl, int write, struct file *filp, +appldata_generic_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct appldata_ops *ops = NULL, *tmp_ops; diff --git a/arch/s390/kernel/debug.c b/arch/s390/kernel/debug.c index 4c512561687d..20f282c911c2 100644 --- a/arch/s390/kernel/debug.c +++ b/arch/s390/kernel/debug.c @@ -881,11 +881,11 @@ static int debug_active=1; * if debug_active is already off */ static int -s390dbf_procactive(ctl_table *table, int write, struct file *filp, +s390dbf_procactive(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { if (!write || debug_stoppable || !debug_active) - return proc_dointvec(table, write, filp, buffer, lenp, ppos); + return proc_dointvec(table, write, buffer, lenp, ppos); else return 0; } diff --git a/arch/s390/mm/cmm.c b/arch/s390/mm/cmm.c index 413c240cbca7..b201135cc18c 100644 --- a/arch/s390/mm/cmm.c +++ b/arch/s390/mm/cmm.c @@ -262,7 +262,7 @@ cmm_skip_blanks(char *cp, char **endp) static struct ctl_table cmm_table[]; static int -cmm_pages_handler(ctl_table *ctl, int write, struct file *filp, +cmm_pages_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char buf[16], *p; @@ -303,7 +303,7 @@ cmm_pages_handler(ctl_table *ctl, int write, struct file *filp, } static int -cmm_timeout_handler(ctl_table *ctl, int write, struct file *filp, +cmm_timeout_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char buf[64], *p; diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index e63cf7d441e1..139d4c1a33a7 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -40,8 +40,7 @@ extern unsigned int nmi_watchdog; #define NMI_INVALID 3 struct ctl_table; -struct file; -extern int proc_nmi_enabled(struct ctl_table *, int , struct file *, +extern int proc_nmi_enabled(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int unknown_nmi_panic; diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index cb66a22d98ad..7ff61d6a188a 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -508,14 +508,14 @@ static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) /* * proc handler for /proc/sys/kernel/nmi */ -int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, +int proc_nmi_enabled(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { int old_state; nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0; old_state = nmi_watchdog_enabled; - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (!!old_state == !!nmi_watchdog_enabled) return 0; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index cf53a78e2dcf..8cb4974ff599 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -228,19 +228,11 @@ static long __vsyscall(3) venosys_1(void) } #ifdef CONFIG_SYSCTL - -static int -vsyscall_sysctl_change(ctl_table *ctl, int write, struct file * filp, - void __user *buffer, size_t *lenp, loff_t *ppos) -{ - return proc_dointvec(ctl, write, filp, buffer, lenp, ppos); -} - static ctl_table kernel_table2[] = { { .procname = "vsyscall64", .data = &vsyscall_gtod_data.sysctl_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = vsyscall_sysctl_change }, + .proc_handler = proc_dointvec }, {} }; diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index 71d1b9bab70b..614da5b8613a 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -3412,7 +3412,7 @@ static int cdrom_print_info(const char *header, int val, char *info, return 0; } -static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp, +static int cdrom_sysctl_info(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int pos; @@ -3489,7 +3489,7 @@ static int cdrom_sysctl_info(ctl_table *ctl, int write, struct file * filp, goto done; doit: mutex_unlock(&cdrom_mutex); - return proc_dostring(ctl, write, filp, buffer, lenp, ppos); + return proc_dostring(ctl, write, buffer, lenp, ppos); done: printk(KERN_INFO "cdrom: info buffer too small\n"); goto doit; @@ -3525,12 +3525,12 @@ static void cdrom_update_settings(void) mutex_unlock(&cdrom_mutex); } -static int cdrom_sysctl_handler(ctl_table *ctl, int write, struct file * filp, +static int cdrom_sysctl_handler(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { diff --git a/drivers/char/random.c b/drivers/char/random.c index d8a9255e1a3f..04b505e5a5e2 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1231,7 +1231,7 @@ static char sysctl_bootid[16]; * as an ASCII string in the standard UUID format. If accesses via the * sysctl system call, it is returned as 16 bytes of binary data. */ -static int proc_do_uuid(ctl_table *table, int write, struct file *filp, +static int proc_do_uuid(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { ctl_table fake_table; @@ -1254,7 +1254,7 @@ static int proc_do_uuid(ctl_table *table, int write, struct file *filp, fake_table.data = buf; fake_table.maxlen = sizeof(buf); - return proc_dostring(&fake_table, write, filp, buffer, lenp, ppos); + return proc_dostring(&fake_table, write, buffer, lenp, ppos); } static int uuid_strategy(ctl_table *table, diff --git a/drivers/net/wireless/arlan-proc.c b/drivers/net/wireless/arlan-proc.c index 2ab1d59870f4..a8b689635a3b 100644 --- a/drivers/net/wireless/arlan-proc.c +++ b/drivers/net/wireless/arlan-proc.c @@ -402,7 +402,7 @@ static int arlan_setup_card_by_book(struct net_device *dev) static char arlan_drive_info[ARLAN_STR_SIZE] = "A655\n\0"; -static int arlan_sysctl_info(ctl_table * ctl, int write, struct file *filp, +static int arlan_sysctl_info(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -629,7 +629,7 @@ final: *lenp = pos; if (!write) - retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, buffer, lenp, ppos); else { *lenp = 0; @@ -639,7 +639,7 @@ final: } -static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp, +static int arlan_sysctl_info161719(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -669,11 +669,11 @@ static int arlan_sysctl_info161719(ctl_table * ctl, int write, struct file *filp final: *lenp = pos; - retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, buffer, lenp, ppos); return retv; } -static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp, +static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -698,11 +698,11 @@ static int arlan_sysctl_infotxRing(ctl_table * ctl, int write, struct file *filp SARLBNpln(u_char, txBuffer, 0x800); final: *lenp = pos; - retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, buffer, lenp, ppos); return retv; } -static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp, +static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -726,11 +726,11 @@ static int arlan_sysctl_inforxRing(ctl_table * ctl, int write, struct file *filp SARLBNpln(u_char, rxBuffer, 0x800); final: *lenp = pos; - retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, buffer, lenp, ppos); return retv; } -static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp, +static int arlan_sysctl_info18(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int i; @@ -756,7 +756,7 @@ static int arlan_sysctl_info18(ctl_table * ctl, int write, struct file *filp, final: *lenp = pos; - retv = proc_dostring(ctl, write, filp, buffer, lenp, ppos); + retv = proc_dostring(ctl, write, buffer, lenp, ppos); return retv; } @@ -766,7 +766,7 @@ final: static char conf_reset_result[200]; -static int arlan_configure(ctl_table * ctl, int write, struct file *filp, +static int arlan_configure(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int pos = 0; @@ -788,10 +788,10 @@ static int arlan_configure(ctl_table * ctl, int write, struct file *filp, return -1; *lenp = pos; - return proc_dostring(ctl, write, filp, buffer, lenp, ppos); + return proc_dostring(ctl, write, buffer, lenp, ppos); } -static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp, +static int arlan_sysctl_reset(ctl_table * ctl, int write, void __user *buffer, size_t * lenp, loff_t *ppos) { int pos = 0; @@ -811,7 +811,7 @@ static int arlan_sysctl_reset(ctl_table * ctl, int write, struct file *filp, } else return -1; *lenp = pos + 3; - return proc_dostring(ctl, write, filp, buffer, lenp, ppos); + return proc_dostring(ctl, write, buffer, lenp, ppos); } diff --git a/drivers/parport/procfs.c b/drivers/parport/procfs.c index 554e11f9e1ce..8eefe56f1cbe 100644 --- a/drivers/parport/procfs.c +++ b/drivers/parport/procfs.c @@ -31,7 +31,7 @@ #define PARPORT_MIN_SPINTIME_VALUE 1 #define PARPORT_MAX_SPINTIME_VALUE 1000 -static int do_active_device(ctl_table *table, int write, struct file *filp, +static int do_active_device(ctl_table *table, int write, void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -68,7 +68,7 @@ static int do_active_device(ctl_table *table, int write, struct file *filp, } #ifdef CONFIG_PARPORT_1284 -static int do_autoprobe(ctl_table *table, int write, struct file *filp, +static int do_autoprobe(ctl_table *table, int write, void __user *result, size_t *lenp, loff_t *ppos) { struct parport_device_info *info = table->extra2; @@ -111,7 +111,7 @@ static int do_autoprobe(ctl_table *table, int write, struct file *filp, #endif /* IEEE1284.3 support. */ static int do_hardware_base_addr (ctl_table *table, int write, - struct file *filp, void __user *result, + void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -139,7 +139,7 @@ static int do_hardware_base_addr (ctl_table *table, int write, } static int do_hardware_irq (ctl_table *table, int write, - struct file *filp, void __user *result, + void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -167,7 +167,7 @@ static int do_hardware_irq (ctl_table *table, int write, } static int do_hardware_dma (ctl_table *table, int write, - struct file *filp, void __user *result, + void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; @@ -195,7 +195,7 @@ static int do_hardware_dma (ctl_table *table, int write, } static int do_hardware_modes (ctl_table *table, int write, - struct file *filp, void __user *result, + void __user *result, size_t *lenp, loff_t *ppos) { struct parport *port = (struct parport *)table->extra1; diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h index 8ccd5ed81d9c..d99860a33890 100644 --- a/fs/coda/coda_int.h +++ b/fs/coda/coda_int.h @@ -2,6 +2,7 @@ #define _CODA_INT_ struct dentry; +struct file; extern struct file_system_type coda_fs_type; extern unsigned long coda_timeout; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index a2edb7913447..31f4b0e6d72c 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -63,9 +63,9 @@ static void drop_slab(void) } int drop_caches_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); if (write) { if (sysctl_drop_caches & 1) drop_pagecache(); diff --git a/fs/file_table.c b/fs/file_table.c index 334ce39881f8..8eb44042e009 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -74,14 +74,14 @@ EXPORT_SYMBOL_GPL(get_max_files); * Handle nr_files sysctl */ #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) -int proc_nr_files(ctl_table *table, int write, struct file *filp, +int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { files_stat.nr_files = get_nr_files(); - return proc_dointvec(table, write, filp, buffer, lenp, ppos); + return proc_dointvec(table, write, buffer, lenp, ppos); } #else -int proc_nr_files(ctl_table *table, int write, struct file *filp, +int proc_nr_files(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 9b1e4e9a16bf..f667e8aeabdf 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -153,7 +153,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, /* careful: calling conventions are nasty here */ res = count; - error = table->proc_handler(table, write, filp, buf, &res, ppos); + error = table->proc_handler(table, write, buf, &res, ppos); if (!error) error = res; out: diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index 916c0ffb6083..c5bc67c4e3bb 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -26,7 +26,6 @@ STATIC int xfs_stats_clear_proc_handler( ctl_table *ctl, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -34,7 +33,7 @@ xfs_stats_clear_proc_handler( int c, ret, *valp = ctl->data; __uint32_t vn_active; - ret = proc_dointvec_minmax(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); if (!ret && write && *valp) { printk("XFS Clearing xfsstats\n"); diff --git a/include/linux/fs.h b/include/linux/fs.h index 51803528b095..33ed6644abd0 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2467,7 +2467,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos); struct ctl_table; -int proc_nr_files(struct ctl_table *table, int write, struct file *filp, +int proc_nr_files(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); int __init get_filesystem_list(char *buf); diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3c0924a18daf..cd3d2abaf30a 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -19,7 +19,7 @@ extern int ftrace_enabled; extern int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); typedef void (*ftrace_func_t)(unsigned long ip, unsigned long parent_ip); @@ -94,7 +94,7 @@ static inline void ftrace_start(void) { } extern int stack_tracer_enabled; int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 176e7ee73eff..11ab19ac6b3d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -20,9 +20,9 @@ static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) } void reset_vma_resv_huge_pages(struct vm_area_struct *vma); -int hugetlb_sysctl_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int hugetlb_overcommit_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); -int hugetlb_treat_movable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); +int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int hugetlb_overcommit_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int hugetlb_treat_movable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *); int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, diff --git a/include/linux/mm.h b/include/linux/mm.h index b6eae5e3144b..87218ae84e36 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1279,7 +1279,7 @@ int in_gate_area_no_task(unsigned long addr); #define in_gate_area(task, addr) ({(void)task; in_gate_area_no_task(addr);}) #endif /* __HAVE_ARCH_GATE_AREA */ -int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *, +int drop_caches_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 652ef01be582..6f7561730d88 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -755,21 +755,20 @@ static inline int is_dma(struct zone *zone) /* These two functions are used to setup the per zone pages min values */ struct ctl_table; -struct file; -int min_free_kbytes_sysctl_handler(struct ctl_table *, int, struct file *, +int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1]; -int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, struct file *, +int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); -int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, struct file *, +int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, - struct file *, void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, - struct file *, void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); extern int numa_zonelist_order_handler(struct ctl_table *, int, - struct file *, void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */ diff --git a/include/linux/sched.h b/include/linux/sched.h index e951bd2eb9fc..811cd96524d7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -309,7 +309,7 @@ extern void softlockup_tick(void); extern void touch_softlockup_watchdog(void); extern void touch_all_softlockup_watchdogs(void); extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; extern int softlockup_thresh; @@ -331,7 +331,7 @@ extern unsigned long sysctl_hung_task_check_count; extern unsigned long sysctl_hung_task_timeout_secs; extern unsigned long sysctl_hung_task_warnings; extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -1906,7 +1906,7 @@ extern unsigned int sysctl_sched_time_avg; extern unsigned int sysctl_timer_migration; int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos); #endif #ifdef CONFIG_SCHED_DEBUG @@ -1924,7 +1924,7 @@ extern unsigned int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int sysctl_sched_compat_yield; diff --git a/include/linux/security.h b/include/linux/security.h index d050b66ab9ef..239e40d0450b 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -133,7 +133,7 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return PAGE_ALIGN(mmap_min_addr); return hint; } -extern int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, +extern int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #ifdef CONFIG_SECURITY diff --git a/include/linux/swap.h b/include/linux/swap.h index 4c78fea989b9..82232dbea3f7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -245,7 +245,7 @@ extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); extern unsigned long scan_unevictable_pages; -extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, +extern int scan_unevictable_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index e76d3b22a466..1e4743ee6831 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -29,7 +29,6 @@ #include #include -struct file; struct completion; #define CTL_MAXNAME 10 /* how many path components do we allow in a @@ -977,25 +976,25 @@ typedef int ctl_handler (struct ctl_table *table, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen); -typedef int proc_handler (struct ctl_table *ctl, int write, struct file * filp, +typedef int proc_handler (struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -extern int proc_dostring(struct ctl_table *, int, struct file *, +extern int proc_dostring(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int proc_dointvec(struct ctl_table *, int, struct file *, +extern int proc_dointvec(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int proc_dointvec_minmax(struct ctl_table *, int, struct file *, +extern int proc_dointvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int proc_dointvec_jiffies(struct ctl_table *, int, struct file *, +extern int proc_dointvec_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, struct file *, +extern int proc_dointvec_userhz_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, struct file *, +extern int proc_dointvec_ms_jiffies(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int proc_doulongvec_minmax(struct ctl_table *, int, struct file *, +extern int proc_doulongvec_minmax(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int, - struct file *, void __user *, size_t *, loff_t *); + void __user *, size_t *, loff_t *); extern int do_sysctl (int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 75cf58666ff9..66ebddcff664 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -110,21 +110,20 @@ extern int laptop_mode; extern unsigned long determine_dirtyable_memory(void); extern int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); extern int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); extern int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); extern int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos); struct ctl_table; -struct file; -int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *, +int dirty_writeback_centisecs_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); void get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, diff --git a/include/net/ip.h b/include/net/ip.h index 72c36926c26d..5b26a0bd178e 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -399,7 +399,7 @@ extern void ip_local_error(struct sock *sk, int err, __be32 daddr, __be16 dport, * fed into the routing cache should use these handlers. */ int ipv4_doint_and_flush(ctl_table *ctl, int write, - struct file* filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos); int ipv4_doint_and_flush_strategy(ctl_table *table, void __user *oldval, size_t __user *oldlenp, diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 1459ed3e2697..f76f22d05721 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -55,7 +55,6 @@ enum { #include struct ctl_table; -struct file; struct inet6_dev; struct net_device; struct net_proto_family; @@ -139,7 +138,6 @@ extern int igmp6_event_report(struct sk_buff *skb); #ifdef CONFIG_SYSCTL extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, - struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos); diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c index 40eab7314aeb..7d3704750efc 100644 --- a/ipc/ipc_sysctl.c +++ b/ipc/ipc_sysctl.c @@ -27,18 +27,18 @@ static void *get_ipc(ctl_table *table) } #ifdef CONFIG_PROC_SYSCTL -static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp, +static int proc_ipc_dointvec(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - return proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); + return proc_dointvec(&ipc_table, write, buffer, lenp, ppos); } static int proc_ipc_callback_dointvec(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; size_t lenp_bef = *lenp; @@ -47,7 +47,7 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write, memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - rc = proc_dointvec(&ipc_table, write, filp, buffer, lenp, ppos); + rc = proc_dointvec(&ipc_table, write, buffer, lenp, ppos); if (write && !rc && lenp_bef == *lenp) /* @@ -61,13 +61,13 @@ static int proc_ipc_callback_dointvec(ctl_table *table, int write, } static int proc_ipc_doulongvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; memcpy(&ipc_table, table, sizeof(ipc_table)); ipc_table.data = get_ipc(table); - return proc_doulongvec_minmax(&ipc_table, write, filp, buffer, + return proc_doulongvec_minmax(&ipc_table, write, buffer, lenp, ppos); } @@ -95,7 +95,7 @@ static void ipc_auto_callback(int val) } static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table ipc_table; size_t lenp_bef = *lenp; @@ -106,7 +106,7 @@ static int proc_ipcauto_dointvec_minmax(ctl_table *table, int write, ipc_table.data = get_ipc(table); oldval = *((int *)(ipc_table.data)); - rc = proc_dointvec_minmax(&ipc_table, write, filp, buffer, lenp, ppos); + rc = proc_dointvec_minmax(&ipc_table, write, buffer, lenp, ppos); if (write && !rc && lenp_bef == *lenp) { int newval = *((int *)(ipc_table.data)); diff --git a/ipc/mq_sysctl.c b/ipc/mq_sysctl.c index 24ae46dfe45d..8a058711fc10 100644 --- a/ipc/mq_sysctl.c +++ b/ipc/mq_sysctl.c @@ -31,24 +31,24 @@ static void *get_mq(ctl_table *table) return which; } -static int proc_mq_dointvec(ctl_table *table, int write, struct file *filp, +static int proc_mq_dointvec(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); mq_table.data = get_mq(table); - return proc_dointvec(&mq_table, write, filp, buffer, lenp, ppos); + return proc_dointvec(&mq_table, write, buffer, lenp, ppos); } static int proc_mq_dointvec_minmax(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) + void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table mq_table; memcpy(&mq_table, table, sizeof(mq_table)); mq_table.data = get_mq(table); - return proc_dointvec_minmax(&mq_table, write, filp, buffer, + return proc_dointvec_minmax(&mq_table, write, buffer, lenp, ppos); } #else diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 022a4927b785..d4e841747400 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -171,12 +171,12 @@ static unsigned long timeout_jiffies(unsigned long timeout) * Process updating of timeout sysctl */ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) goto out; diff --git a/kernel/sched.c b/kernel/sched.c index 0d0361b9dbb3..ee61f454a98b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -10312,7 +10312,7 @@ static int sched_rt_global_constraints(void) #endif /* CONFIG_RT_GROUP_SCHED */ int sched_rt_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -10323,7 +10323,7 @@ int sched_rt_handler(struct ctl_table *table, int write, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_constraints(); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index ecc637a0d591..4e777b47eeda 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -384,10 +384,10 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) #ifdef CONFIG_SCHED_DEBUG int sched_nr_latency_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret || !write) return ret; diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 09d7519557d3..0d31135efbf4 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -26,10 +26,10 @@ static void slow_work_cull_timeout(unsigned long); static void slow_work_oom_timeout(unsigned long); #ifdef CONFIG_SYSCTL -static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *, +static int slow_work_min_threads_sysctl(struct ctl_table *, int, void __user *, size_t *, loff_t *); -static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *, +static int slow_work_max_threads_sysctl(struct ctl_table *, int , void __user *, size_t *, loff_t *); #endif @@ -493,10 +493,10 @@ static void slow_work_oom_timeout(unsigned long data) * Handle adjustment of the minimum number of threads */ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { @@ -521,10 +521,10 @@ static int slow_work_min_threads_sysctl(struct ctl_table *table, int write, * Handle adjustment of the maximum number of threads */ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); int n; if (ret == 0) { diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 88796c330838..81324d12eb35 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -90,11 +90,11 @@ void touch_all_softlockup_watchdogs(void) EXPORT_SYMBOL(touch_all_softlockup_watchdogs); int proc_dosoftlockup_thresh(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { touch_all_softlockup_watchdogs(); - return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 37abb8c3995b..a02697b7cb97 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -163,9 +163,9 @@ extern int max_lock_depth; #endif #ifdef CONFIG_PROC_SYSCTL -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); #endif @@ -2226,7 +2226,7 @@ void sysctl_head_put(struct ctl_table_header *head) #ifdef CONFIG_PROC_SYSCTL static int _proc_do_string(void* data, int maxlen, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { size_t len; @@ -2287,7 +2287,6 @@ static int _proc_do_string(void* data, int maxlen, int write, * proc_dostring - read a string sysctl * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2301,10 +2300,10 @@ static int _proc_do_string(void* data, int maxlen, int write, * * Returns 0 on success. */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return _proc_do_string(table->data, table->maxlen, write, filp, + return _proc_do_string(table->data, table->maxlen, write, buffer, lenp, ppos); } @@ -2329,7 +2328,7 @@ static int do_proc_dointvec_conv(int *negp, unsigned long *lvalp, } static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, - int write, struct file *filp, void __user *buffer, + int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), @@ -2436,13 +2435,13 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, #undef TMPBUFLEN } -static int do_proc_dointvec(struct ctl_table *table, int write, struct file *filp, +static int do_proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, int (*conv)(int *negp, unsigned long *lvalp, int *valp, int write, void *data), void *data) { - return __do_proc_dointvec(table->data, table, write, filp, + return __do_proc_dointvec(table->data, table, write, buffer, lenp, ppos, conv, data); } @@ -2450,7 +2449,6 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * proc_dointvec - read a vector of integers * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2460,10 +2458,10 @@ static int do_proc_dointvec(struct ctl_table *table, int write, struct file *fil * * Returns 0 on success. */ -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, NULL,NULL); } @@ -2471,7 +2469,7 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp, * Taint values can only be increased * This means we can safely use a temporary. */ -static int proc_taint(struct ctl_table *table, int write, struct file *filp, +static int proc_taint(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table t; @@ -2483,7 +2481,7 @@ static int proc_taint(struct ctl_table *table, int write, struct file *filp, t = *table; t.data = &tmptaint; - err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos); + err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos); if (err < 0) return err; @@ -2535,7 +2533,6 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * proc_dointvec_minmax - read a vector of integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2548,19 +2545,18 @@ static int do_proc_dointvec_minmax_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct do_proc_dointvec_minmax_conv_param param = { .min = (int *) table->extra1, .max = (int *) table->extra2, }; - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_minmax_conv, ¶m); } static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, @@ -2665,21 +2661,19 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int } static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos, unsigned long convmul, unsigned long convdiv) { return __do_proc_doulongvec_minmax(table->data, table, write, - filp, buffer, lenp, ppos, convmul, convdiv); + buffer, lenp, ppos, convmul, convdiv); } /** * proc_doulongvec_minmax - read a vector of long integers with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2692,17 +2686,16 @@ static int do_proc_doulongvec_minmax(struct ctl_table *table, int write, * * Returns 0 on success. */ -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos, 1l, 1l); + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, 1l, 1l); } /** * proc_doulongvec_ms_jiffies_minmax - read a vector of millisecond values with min/max values * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2717,11 +2710,10 @@ int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp * Returns 0 on success. */ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_doulongvec_minmax(table, write, filp, buffer, + return do_proc_doulongvec_minmax(table, write, buffer, lenp, ppos, HZ, 1000l); } @@ -2797,7 +2789,6 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * proc_dointvec_jiffies - read a vector of integers as seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2809,10 +2800,10 @@ static int do_proc_dointvec_ms_jiffies_conv(int *negp, unsigned long *lvalp, * * Returns 0 on success. */ -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_jiffies_conv,NULL); } @@ -2820,7 +2811,6 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * proc_dointvec_userhz_jiffies - read a vector of integers as 1/USER_HZ seconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: pointer to the file position @@ -2832,10 +2822,10 @@ int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, * * Returns 0 on success. */ -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + return do_proc_dointvec(table,write,buffer,lenp,ppos, do_proc_dointvec_userhz_jiffies_conv,NULL); } @@ -2843,7 +2833,6 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * proc_dointvec_ms_jiffies - read a vector of integers as 1 milliseconds * @table: the sysctl table * @write: %TRUE if this is a write to the sysctl file - * @filp: the file structure * @buffer: the user buffer * @lenp: the size of the user buffer * @ppos: file position @@ -2856,14 +2845,14 @@ int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file * * Returns 0 on success. */ -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_dointvec(table, write, filp, buffer, lenp, ppos, + return do_proc_dointvec(table, write, buffer, lenp, ppos, do_proc_dointvec_ms_jiffies_conv, NULL); } -static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, +static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct pid *new_pid; @@ -2872,7 +2861,7 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp tmp = pid_vnr(cad_pid); - r = __do_proc_dointvec(&tmp, table, write, filp, buffer, + r = __do_proc_dointvec(&tmp, table, write, buffer, lenp, ppos, NULL, NULL); if (r || !write) return r; @@ -2887,50 +2876,49 @@ static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp #else /* CONFIG_PROC_FS */ -int proc_dostring(struct ctl_table *table, int write, struct file *filp, +int proc_dostring(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_userhz_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, struct file *filp, +int proc_dointvec_ms_jiffies(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } -int proc_doulongvec_minmax(struct ctl_table *table, int write, struct file *filp, +int proc_doulongvec_minmax(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { return -ENOSYS; } int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 23df7771c937..a142579765bf 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3015,7 +3015,7 @@ int unregister_ftrace_function(struct ftrace_ops *ops) int ftrace_enable_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -3025,7 +3025,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, mutex_lock(&ftrace_lock); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled)) goto out; diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 0f6facb050a1..8504ac71e4e8 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -296,14 +296,14 @@ static const struct file_operations stack_trace_fops = { int stack_trace_sysctl(struct ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&stack_sysctl_mutex); - ret = proc_dointvec(table, write, file, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret || !write || (last_stack_tracer_enabled == !!stack_tracer_enabled)) diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 92359cc747a7..69eae358a726 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -42,14 +42,14 @@ static void put_uts(ctl_table *table, int write, void *which) * Special case of dostring for the UTS structure. This has locks * to observe. Should this be in kernel/sys.c ???? */ -static int proc_do_uts_string(ctl_table *table, int write, struct file *filp, +static int proc_do_uts_string(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct ctl_table uts_table; int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,filp,buffer,lenp, ppos); + r = proc_dostring(&uts_table,write,buffer,lenp, ppos); put_uts(table, write, uts_table.data); return r; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 815dbd4a6dcb..6f048fcc749c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f378dd58802..be197f71b096 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -155,37 +155,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -195,13 +195,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -686,9 +686,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5717f27a0704..88248b3c20bb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2373,7 +2373,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; @@ -2382,7 +2382,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, if (write) strncpy(saved_string, (char*)table->data, NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, file, buffer, length, ppos); + ret = proc_dostring(table, write, buffer, length, ppos); if (ret) return ret; if (write) { @@ -4706,9 +4706,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4716,12 +4716,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4732,12 +4732,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4758,9 +4758,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4772,13 +4772,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 2423782214ab..f444b7409085 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2844,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 907a82e9023d..a16a2342f6bf 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -965,12 +965,12 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = { #ifdef CONFIG_SYSCTL static -int brnf_sysctl_call_tables(ctl_table * ctl, int write, struct file *filp, +int brnf_sysctl_call_tables(ctl_table * ctl, int write, void __user * buffer, size_t * lenp, loff_t * ppos) { int ret; - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *(int *)(ctl->data)) *(int *)(ctl->data) = 1; diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 1c6a5bb6f0c8..6e1f085db06a 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -164,7 +164,7 @@ static int max_t3[] = { 8191 }; /* Must fit in 16 bits when multiplied by BCT3MU static int min_priority[1]; static int max_priority[] = { 127 }; /* From DECnet spec */ -static int dn_forwarding_proc(ctl_table *, int, struct file *, +static int dn_forwarding_proc(ctl_table *, int, void __user *, size_t *, loff_t *); static int dn_forwarding_sysctl(ctl_table *table, void __user *oldval, size_t __user *oldlenp, @@ -274,7 +274,6 @@ static void dn_dev_sysctl_unregister(struct dn_dev_parms *parms) } static int dn_forwarding_proc(ctl_table *table, int write, - struct file *filep, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -290,7 +289,7 @@ static int dn_forwarding_proc(ctl_table *table, int write, dn_db = dev->dn_ptr; old = dn_db->parms.forwarding; - err = proc_dointvec(table, write, filep, buffer, lenp, ppos); + err = proc_dointvec(table, write, buffer, lenp, ppos); if ((err >= 0) && write) { if (dn_db->parms.forwarding < 0) diff --git a/net/decnet/sysctl_net_decnet.c b/net/decnet/sysctl_net_decnet.c index 5bcd592ae6dd..26b0ab1e9f56 100644 --- a/net/decnet/sysctl_net_decnet.c +++ b/net/decnet/sysctl_net_decnet.c @@ -165,7 +165,6 @@ static int dn_node_address_strategy(ctl_table *table, } static int dn_node_address_handler(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -276,7 +275,6 @@ static int dn_def_dev_strategy(ctl_table *table, static int dn_def_dev_handler(ctl_table *table, int write, - struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) { diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 07336c6201f0..e92f1fd28aa5 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1270,10 +1270,10 @@ static void inet_forward_change(struct net *net) } static int devinet_conf_proc(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) { struct ipv4_devconf *cnf = ctl->extra1; @@ -1342,12 +1342,12 @@ static int devinet_conf_sysctl(ctl_table *table, } static int devinet_sysctl_forward(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write && *valp != val) { struct net *net = ctl->extra2; @@ -1372,12 +1372,12 @@ static int devinet_sysctl_forward(ctl_table *ctl, int write, } int ipv4_doint_and_flush(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; - int ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec(ctl, write, buffer, lenp, ppos); struct net *net = ctl->extra2; if (write && *valp != val) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index df9347314538..bb4199252026 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3036,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev) #ifdef CONFIG_SYSCTL static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { @@ -3046,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, memcpy(&ctl, __ctl, sizeof(ctl)); ctl.data = &flush_delay; - proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); + proc_dointvec(&ctl, write, buffer, lenp, ppos); net = (struct net *)__ctl->extra1; rt_cache_flush(net, flush_delay); @@ -3106,12 +3106,11 @@ static void rt_secret_reschedule(int old) } static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); rt_secret_reschedule(old); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4710d219f06a..2dcf04d9b005 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -36,7 +36,7 @@ static void set_local_port_range(int range[2]) } /* Validate changes from /proc interface. */ -static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, +static int ipv4_local_port_range(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -51,7 +51,7 @@ static int ipv4_local_port_range(ctl_table *table, int write, struct file *filp, }; inet_get_local_port_range(range, range + 1); - ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) @@ -91,7 +91,7 @@ static int ipv4_sysctl_local_port_range(ctl_table *table, } -static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * filp, +static int proc_tcp_congestion_control(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char val[TCP_CA_NAME_MAX]; @@ -103,7 +103,7 @@ static int proc_tcp_congestion_control(ctl_table *ctl, int write, struct file * tcp_get_default_congestion_control(val); - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_default_congestion_control(val); return ret; @@ -129,7 +129,7 @@ static int sysctl_tcp_congestion_control(ctl_table *table, } static int proc_tcp_available_congestion_control(ctl_table *ctl, - int write, struct file * filp, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -140,13 +140,13 @@ static int proc_tcp_available_congestion_control(ctl_table *ctl, if (!tbl.data) return -ENOMEM; tcp_get_available_congestion_control(tbl.data, TCP_CA_BUF_MAX); - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); kfree(tbl.data); return ret; } static int proc_allowed_congestion_control(ctl_table *ctl, - int write, struct file * filp, + int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -158,7 +158,7 @@ static int proc_allowed_congestion_control(ctl_table *ctl, return -ENOMEM; tcp_get_allowed_congestion_control(tbl.data, tbl.maxlen); - ret = proc_dostring(&tbl, write, filp, buffer, lenp, ppos); + ret = proc_dostring(&tbl, write, buffer, lenp, ppos); if (write && ret == 0) ret = tcp_set_allowed_congestion_control(tbl.data); kfree(tbl.data); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 55f486d89c88..1fd0a3d775d2 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3986,14 +3986,14 @@ static void ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) #ifdef CONFIG_SYSCTL static -int addrconf_sysctl_forward(ctl_table *ctl, int write, struct file * filp, +int addrconf_sysctl_forward(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret; - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) ret = addrconf_fixup_forwarding(ctl, valp, val); @@ -4090,14 +4090,14 @@ static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) } static -int addrconf_sysctl_disable(ctl_table *ctl, int write, struct file * filp, +int addrconf_sysctl_disable(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = ctl->data; int val = *valp; int ret; - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); if (write) ret = addrconf_disable_ipv6(ctl, valp, val); diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7015478797f6..498b9b0b0fad 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1735,7 +1735,7 @@ static void ndisc_warn_deprecated_sysctl(struct ctl_table *ctl, } } -int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, size_t *lenp, loff_t *ppos) +int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net_device *dev = ctl->extra1; struct inet6_dev *idev; @@ -1746,16 +1746,16 @@ int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * f ndisc_warn_deprecated_sysctl(ctl, "syscall", dev ? dev->name : "default"); if (strcmp(ctl->procname, "retrans_time") == 0) - ret = proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(ctl, write, buffer, lenp, ppos); else if (strcmp(ctl->procname, "base_reachable_time") == 0) ret = proc_dointvec_jiffies(ctl, write, - filp, buffer, lenp, ppos); + buffer, lenp, ppos); else if ((strcmp(ctl->procname, "retrans_time_ms") == 0) || (strcmp(ctl->procname, "base_reachable_time_ms") == 0)) ret = proc_dointvec_ms_jiffies(ctl, write, - filp, buffer, lenp, ppos); + buffer, lenp, ppos); else ret = -1; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 77aecbe8ff6c..d6fe7646a8ff 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2524,13 +2524,13 @@ static const struct file_operations rt6_stats_seq_fops = { #ifdef CONFIG_SYSCTL static -int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp, +int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { struct net *net = current->nsproxy->net_ns; int delay = net->ipv6.sysctl.flush_delay; if (write) { - proc_dointvec(ctl, write, filp, buffer, lenp, ppos); + proc_dointvec(ctl, write, buffer, lenp, ppos); fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net); return 0; } else diff --git a/net/irda/irsysctl.c b/net/irda/irsysctl.c index 57f8817c3979..5c86567e5a78 100644 --- a/net/irda/irsysctl.c +++ b/net/irda/irsysctl.c @@ -73,12 +73,12 @@ static int min_lap_keepalive_time = 100; /* 100us */ /* For other sysctl, I've no idea of the range. Maybe Dag could help * us on that - Jean II */ -static int do_devname(ctl_table *table, int write, struct file *filp, +static int do_devname(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dostring(table, write, filp, buffer, lenp, ppos); + ret = proc_dostring(table, write, buffer, lenp, ppos); if (ret == 0 && write) { struct ias_value *val; @@ -90,12 +90,12 @@ static int do_devname(ctl_table *table, int write, struct file *filp, } -static int do_discovery(ctl_table *table, int write, struct file *filp, +static int do_discovery(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec(table, write, buffer, lenp, ppos); if (ret) return ret; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index fba2892b99e1..446e9bd4b4bc 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1496,14 +1496,14 @@ static int ip_vs_zero_all(void) static int -proc_do_defense_mode(ctl_table *table, int write, struct file * filp, +proc_do_defense_mode(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; int val = *valp; int rc; - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + rc = proc_dointvec(table, write, buffer, lenp, ppos); if (write && (*valp != val)) { if ((*valp < 0) || (*valp > 3)) { /* Restore the correct value */ @@ -1517,7 +1517,7 @@ proc_do_defense_mode(ctl_table *table, int write, struct file * filp, static int -proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, +proc_do_sync_threshold(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int *valp = table->data; @@ -1527,7 +1527,7 @@ proc_do_sync_threshold(ctl_table *table, int write, struct file *filp, /* backup the value first */ memcpy(val, valp, sizeof(val)); - rc = proc_dointvec(table, write, filp, buffer, lenp, ppos); + rc = proc_dointvec(table, write, buffer, lenp, ppos); if (write && (valp[0] < 0 || valp[1] < 0 || valp[0] >= valp[1])) { /* Restore the correct value */ memcpy(valp, val, sizeof(val)); diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 4e620305f28c..c93494fef8ef 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -226,7 +226,7 @@ static char nf_log_sysctl_fnames[NFPROTO_NUMPROTO-NFPROTO_UNSPEC][3]; static struct ctl_table nf_log_sysctl_table[NFPROTO_NUMPROTO+1]; static struct ctl_table_header *nf_log_dir_header; -static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp, +static int nf_log_proc_dostring(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { const struct nf_logger *logger; @@ -260,7 +260,7 @@ static int nf_log_proc_dostring(ctl_table *table, int write, struct file *filp, table->data = "NONE"; else table->data = logger->name; - r = proc_dostring(table, write, filp, buffer, lenp, ppos); + r = proc_dostring(table, write, buffer, lenp, ppos); mutex_unlock(&nf_log_mutex); } diff --git a/net/phonet/sysctl.c b/net/phonet/sysctl.c index 7b5749ee2765..2220f3322326 100644 --- a/net/phonet/sysctl.c +++ b/net/phonet/sysctl.c @@ -56,7 +56,7 @@ void phonet_get_local_port_range(int *min, int *max) } while (read_seqretry(&local_port_range_lock, seq)); } -static int proc_local_port_range(ctl_table *table, int write, struct file *filp, +static int proc_local_port_range(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { @@ -70,7 +70,7 @@ static int proc_local_port_range(ctl_table *table, int write, struct file *filp, .extra2 = &local_port_range_max, }; - ret = proc_dointvec_minmax(&tmp, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); if (write && ret == 0) { if (range[1] < range[0]) diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 5231f7aaac0e..42f9748ae093 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -56,7 +56,7 @@ rpc_unregister_sysctl(void) } } -static int proc_do_xprt(ctl_table *table, int write, struct file *file, +static int proc_do_xprt(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[256]; @@ -71,7 +71,7 @@ static int proc_do_xprt(ctl_table *table, int write, struct file *file, } static int -proc_dodebug(ctl_table *table, int write, struct file *file, +proc_dodebug(ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { char tmpbuf[20], c, *s; diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 87101177825b..35fb68b9c8ec 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -80,7 +80,7 @@ struct kmem_cache *svc_rdma_ctxt_cachep; * current value. */ static int read_reset_stat(ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { atomic_t *stat = (atomic_t *)table->data; diff --git a/security/min_addr.c b/security/min_addr.c index 14cc7b3b8d03..c844eed7915d 100644 --- a/security/min_addr.c +++ b/security/min_addr.c @@ -28,12 +28,12 @@ static void update_mmap_min_addr(void) * sysctl handler which just sets dac_mmap_min_addr = the new value and then * calls update_mmap_min_addr() so non MAP_FIXED hints get rounded properly */ -int mmap_min_addr_handler(struct ctl_table *table, int write, struct file *filp, +int mmap_min_addr_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); update_mmap_min_addr(); -- cgit v1.2.3 From 3e2ada5867b7e9fa0b296d30fa8f3726ebd0a8b7 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Sun, 27 Sep 2009 03:30:51 -0400 Subject: ACPI: fix Compaq Evo N800c (Pentium 4m) boot hang regression Don't disable ARB_DISABLE when the familary ID is 0x0F. http://bugzilla.kernel.org/show_bug.cgi?id=14211 This was a 2.6.31 regression, and so this patch needs to be applied to 2.6.31.stable Signed-off-by: Zhao Yakui Signed-off-by: Len Brown --- arch/x86/kernel/acpi/cstate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 8c44c232efcb..59cdfa4686b2 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags, * P4, Core and beyond CPUs */ if (c->x86_vendor == X86_VENDOR_INTEL && - (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14))) + (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) flags->bm_control = 0; } EXPORT_SYMBOL(acpi_processor_power_init_bm_check); -- cgit v1.2.3 From e207e143e2fb6a2790b1ce3687c8aedc3ddc357b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 30 Sep 2009 07:48:37 -0700 Subject: Revert "x86, mce: do not compile mcelog message on AMD" This reverts commit 22223c9b417be5fd0ab2cf9ad17eb7bd1e19f7b9, as requested by Andi Kleen: "Obviously kernels compiled with AMD support can still run on non AMD systems, so messages like this can never be removed at compile time." Requsted-by: Andi Kleen Cc: Borislav Petkov Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mcheck/mce.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 4b2af86e3e8d..183c3457d2f4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -204,10 +204,7 @@ static void print_mce_head(void) static void print_mce_tail(void) { printk(KERN_EMERG "This is not a software problem!\n" -#if (!defined(CONFIG_EDAC) || !defined(CONFIG_CPU_SUP_AMD)) - "Run through mcelog --ascii to decode and contact your hardware vendor\n" -#endif - ); + "Run through mcelog --ascii to decode and contact your hardware vendor\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ -- cgit v1.2.3 From 79e1dd05d1a22e95ab6d54d21836f478b3b56976 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Sep 2009 17:07:54 +0200 Subject: x86: Provide an alternative() based cmpxchg64() cmpxchg64() today generates, to quote Linus, "barf bag" code. cmpxchg64() is about to get used in the scheduler to fix a bug there, but it's a prerequisite that cmpxchg64() first be made non-sucking. This patch turns cmpxchg64() into an efficient implementation that uses the alternative() mechanism to just use the raw instruction on all modern systems. Note: the fallback is NOT smp safe, just like the current fallback is not SMP safe. (Interested parties with i486 based SMP systems are welcome to submit fix patches for that.) Signed-off-by: Arjan van de Ven Acked-by: Linus Torvalds [ fixed asm constraint bug ] Fixed-by: Eric Dumazet Cc: Martin Schwidefsky Cc: John Stultz Cc: Peter Zijlstra LKML-Reference: <20090930170754.0886ff2e@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cmpxchg_32.h | 30 ++++++++++++--------- arch/x86/kernel/i386_ksyms_32.c | 8 ++++++ arch/x86/lib/Makefile | 2 +- arch/x86/lib/cmpxchg8b_emu.S | 57 +++++++++++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 arch/x86/lib/cmpxchg8b_emu.S (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 82ceb788a981..ee1931be6593 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -312,19 +312,23 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64); -#define cmpxchg64(ptr, o, n) \ -({ \ - __typeof__(*(ptr)) __ret; \ - if (likely(boot_cpu_data.x86 > 4)) \ - __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - else \ - __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \ - (unsigned long long)(o), \ - (unsigned long long)(n)); \ - __ret; \ -}) +#define cmpxchg64(ptr, o, n) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (o); \ + __typeof__(*(ptr)) __new = (n); \ + alternative_io("call cmpxchg8b_emu", \ + "lock; cmpxchg8b (%%esi)" , \ + X86_FEATURE_CX8, \ + "=A" (__ret), \ + "S" ((ptr)), "0" (__old), \ + "b" ((unsigned int)__new), \ + "c" ((unsigned int)(__new>>32)) \ + : "memory"); \ + __ret; }) + + + #define cmpxchg64_local(ptr, o, n) \ ({ \ __typeof__(*(ptr)) __ret; \ diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 43cec6bdda63..1736c5a725aa 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -10,6 +10,14 @@ EXPORT_SYMBOL(mcount); #endif +/* + * Note, this is a prototype to get at the symbol for + * the export, but dont use it from C code, it is used + * by assembly code and is not using C calling convention! + */ +extern void cmpxchg8b_emu(void); +EXPORT_SYMBOL(cmpxchg8b_emu); + /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 9e609206fac9..3e549b8ec8c9 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -15,7 +15,7 @@ ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o - lib-y += semaphore_32.o string_32.o + lib-y += semaphore_32.o string_32.o cmpxchg8b_emu.o lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else diff --git a/arch/x86/lib/cmpxchg8b_emu.S b/arch/x86/lib/cmpxchg8b_emu.S new file mode 100644 index 000000000000..828cb710dec2 --- /dev/null +++ b/arch/x86/lib/cmpxchg8b_emu.S @@ -0,0 +1,57 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + */ + +#include +#include +#include +#include + + +.text + +/* + * Inputs: + * %esi : memory location to compare + * %eax : low 32 bits of old value + * %edx : high 32 bits of old value + * %ebx : low 32 bits of new value + * %ecx : high 32 bits of new value + */ +ENTRY(cmpxchg8b_emu) +CFI_STARTPROC + +# +# Emulate 'cmpxchg8b (%esi)' on UP except we don't +# set the whole ZF thing (caller will just compare +# eax:edx with the expected value) +# +cmpxchg8b_emu: + pushfl + cli + + cmpl (%esi), %eax + jne not_same + cmpl 4(%esi), %edx + jne half_same + + movl %ebx, (%esi) + movl %ecx, 4(%esi) + + popfl + ret + + not_same: + movl (%esi), %eax + half_same: + movl 4(%esi), %edx + + popfl + ret + +CFI_ENDPROC +ENDPROC(cmpxchg8b_emu) -- cgit v1.2.3 From 04edbdef02ec4386a2ae38cab7ae7d166e17a756 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 1 Oct 2009 07:30:38 +0200 Subject: x86: Don't generate cmpxchg8b_emu if CONFIG_X86_CMPXCHG64=y Conditionaly compile cmpxchg8b_emu.o and EXPORT_SYMBOL(cmpxchg8b_emu). This reduces the kernel size a bit. Signed-off-by: Eric Dumazet Cc: Arjan van de Ven Cc: Martin Schwidefsky Cc: John Stultz Cc: Peter Zijlstra Cc: Linus Torvalds LKML-Reference: <4AC43E7E.1000600@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/i386_ksyms_32.c | 2 ++ arch/x86/lib/Makefile | 6 ++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 1736c5a725aa..9c3bd4a2050e 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -15,8 +15,10 @@ EXPORT_SYMBOL(mcount); * the export, but dont use it from C code, it is used * by assembly code and is not using C calling convention! */ +#ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); EXPORT_SYMBOL(cmpxchg8b_emu); +#endif /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 3e549b8ec8c9..85f5db95c60f 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -15,8 +15,10 @@ ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o lib-y += checksum_32.o lib-y += strstr_32.o - lib-y += semaphore_32.o string_32.o cmpxchg8b_emu.o - + lib-y += semaphore_32.o string_32.o +ifneq ($(CONFIG_X86_CMPXCHG64),y) + lib-y += cmpxchg8b_emu.o +endif lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += io_64.o iomap_copy_64.o -- cgit v1.2.3 From ea3acb199a5d7e4da1de0a4288eba993b29f33b9 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Thu, 24 Sep 2009 09:08:30 -0500 Subject: x86: earlyprintk: Fix regression to handle serial,ttySn as 1 arg Commit c953094 ("early_printk: Allow more than one early console") introduced a regression in the parsing of the earlyprintk= kernel arguments. If you specify "earlyprintk=serial,ttyS0,115200" as a kernel argument, the "serial,ttyS" should be parsed as a single argument and not as "serial" and then "ttyS". Also update the documentation to reflect you can specify the ttyS directly without the "serial" argument. Signed-off-by: Jason Wessel Cc: Len Brown Cc: Greg KH Cc: Linus Torvalds Cc: Andrew Morton Cc: Johannes Weiner LKML-Reference: <4ABB7D5E.6000301@windriver.com> Signed-off-by: Ingo Molnar --- Documentation/kernel-parameters.txt | 1 + arch/x86/kernel/early_printk.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 6fa7292947e5..9107b387e91f 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -671,6 +671,7 @@ and is between 256 and 4096 characters. It is defined in the file earlyprintk= [X86,SH,BLACKFIN] earlyprintk=vga earlyprintk=serial[,ttySn[,baudrate]] + earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] Append ",keep" to not disable it when the real console diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 41fd965c80c6..b9c830c12b4a 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -206,8 +206,11 @@ static int __init setup_early_printk(char *buf) while (*buf != '\0') { if (!strncmp(buf, "serial", 6)) { - early_serial_init(buf + 6); + buf += 6; + early_serial_init(buf); early_console_register(&early_serial_console, keep); + if (!strncmp(buf, ",ttyS", 5)) + buf += 5; } if (!strncmp(buf, "ttyS", 4)) { early_serial_init(buf + 4); -- cgit v1.2.3 From f436f8bb73138bc74eb1c6527723e00988ad8a8a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 1 Oct 2009 16:14:32 +0200 Subject: x86: EDAC: MCE: Fix MCE decoding callback logic Make decoding of MCEs happen only on AMD hardware by registering a non-default callback only on CPU families which support it. While looking at the interaction of decode_mce() with the other MCE code i also noticed a few other things and made the following cleanups/fixes: - Fixed the mce_decode() weak alias - a weak alias is really not good here, it should be a proper callback. A weak alias will be overriden if a piece of code is built into the kernel - not good, obviously. - The patch initializes the callback on AMD family 10h and 11h. - Added the more correct fallback printk of: No support for human readable MCE decoding on this CPU type. Transcribe the message and run it through 'mcelog --ascii' to decode. On CPUs that dont have a decoder. - Made the surrounding code more readable. Note that the callback allows us to have a default fallback - without having to check the CPU versions during the printout itself. When an EDAC module registers itself, it can install the decode-print function. (there's no unregister needed as this is core code.) version -v2 by Borislav Petkov: - add K8 to the set of supported CPUs - always build in edac_mce_amd since we use an early_initcall now - fix checkpatch warnings Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Andi Kleen LKML-Reference: <20091001141432.GA11410@aftab> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/mce.h | 2 ++ arch/x86/kernel/cpu/mcheck/mce.c | 58 +++++++++++++++++++++++++--------------- drivers/edac/Makefile | 2 +- drivers/edac/edac_mce_amd.c | 15 ++++++++++- 4 files changed, 53 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index b608a64c5814..f1363b72364f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -133,6 +133,8 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {} static inline void enable_p5_mce(void) {} #endif +extern void (*x86_mce_decode_callback)(struct mce *m); + void mce_setup(struct mce *m); void mce_log(struct mce *m); DECLARE_PER_CPU(struct sys_device, mce_dev); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 183c3457d2f4..b1598a9436d0 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -85,6 +85,18 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; +static void default_decode_mce(struct mce *m) +{ + pr_emerg("No human readable MCE decoding support on this CPU type.\n"); + pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); +} + +/* + * CPU/chipset specific EDAC code can register a callback here to print + * MCE errors in a human-readable form: + */ +void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce; +EXPORT_SYMBOL(x86_mce_decode_callback); /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { @@ -165,46 +177,46 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } -void __weak decode_mce(struct mce *m) -{ - return; -} - static void print_mce(struct mce *m) { - printk(KERN_EMERG - "CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); + if (m->ip) { - printk(KERN_EMERG "RIP%s %02x:<%016Lx> ", - !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + pr_emerg("RIP%s %02x:<%016Lx> ", + !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", + m->cs, m->ip); + if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); - printk(KERN_CONT "\n"); + pr_cont("\n"); } - printk(KERN_EMERG "TSC %llx ", m->tsc); + + pr_emerg("TSC %llx ", m->tsc); if (m->addr) - printk(KERN_CONT "ADDR %llx ", m->addr); + pr_cont("ADDR %llx ", m->addr); if (m->misc) - printk(KERN_CONT "MISC %llx ", m->misc); - printk(KERN_CONT "\n"); - printk(KERN_EMERG "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, - m->apicid); + pr_cont("MISC %llx ", m->misc); + + pr_cont("\n"); + pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); - decode_mce(m); + /* + * Print out human-readable details about the MCE error, + * (if the CPU has an implementation for that): + */ + x86_mce_decode_callback(m); } static void print_mce_head(void) { - printk(KERN_EMERG "\nHARDWARE ERROR\n"); + pr_emerg("\nHARDWARE ERROR\n"); } static void print_mce_tail(void) { - printk(KERN_EMERG "This is not a software problem!\n" - "Run through mcelog --ascii to decode and contact your hardware vendor\n"); + pr_emerg("This is not a software problem!\n"); } #define PANIC_TIMEOUT 5 /* 5 seconds */ @@ -218,6 +230,7 @@ static atomic_t mce_fake_paniced; static void wait_for_panic(void) { long timeout = PANIC_TIMEOUT*USEC_PER_SEC; + preempt_disable(); local_irq_enable(); while (timeout-- > 0) @@ -285,6 +298,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp) static int msr_to_offset(u32 msr) { unsigned bank = __get_cpu_var(injectm.bank); + if (msr == rip_msr) return offsetof(struct mce, ip); if (msr == MSR_IA32_MCx_STATUS(bank)) diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index 7a473bbe8abd..8701cd7ce4e3 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -18,7 +18,7 @@ edac_core-objs += edac_pci.o edac_pci_sysfs.o endif ifdef CONFIG_CPU_SUP_AMD -edac_core-objs += edac_mce_amd.o +obj-$(CONFIG_X86_MCE) += edac_mce_amd.o endif obj-$(CONFIG_EDAC_AMD76X) += amd76x_edac.o diff --git a/drivers/edac/edac_mce_amd.c b/drivers/edac/edac_mce_amd.c index 0c21c370c9dd..83a01a1187d7 100644 --- a/drivers/edac/edac_mce_amd.c +++ b/drivers/edac/edac_mce_amd.c @@ -362,7 +362,7 @@ static inline void amd_decode_err_code(unsigned int ec) pr_warning("Huh? Unknown MCE error 0x%x\n", ec); } -void decode_mce(struct mce *m) +static void amd_decode_mce(struct mce *m) { struct err_regs regs; int node, ecc; @@ -420,3 +420,16 @@ void decode_mce(struct mce *m) amd_decode_err_code(m->status & 0xffff); } + +static int __init mce_amd_init(void) +{ + /* + * We can decode MCEs for Opteron and later CPUs: + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && + (boot_cpu_data.x86 >= 0xf)) + x86_mce_decode_callback = amd_decode_mce; + + return 0; +} +early_initcall(mce_amd_init); -- cgit v1.2.3 From 11879ba5d9ab8174af9b9cefbb2396a54dfbf8c1 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 26 Sep 2009 20:51:50 +0200 Subject: x86: Simplify bound checks in the MTRR code The current bound checks for copy_from_user in the MTRR driver are not as obvious as they could be, and gcc agrees with that. This patch simplifies the boundary checks to the point that gcc can now prove to itself that the copy_from_user() is never going past its bounds. Signed-off-by: Arjan van de Ven Cc: Yinghai Lu Cc: Linus Torvalds LKML-Reference: <20090926205150.30797709@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/if.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index f04e72527604..3c1b12d461d1 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -96,17 +96,24 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos) unsigned long long base, size; char *ptr; char line[LINE_SIZE]; + int length; size_t linelen; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!len) - return -EINVAL; memset(line, 0, LINE_SIZE); - if (len > LINE_SIZE) - len = LINE_SIZE; - if (copy_from_user(line, buf, len - 1)) + + length = len; + length--; + + if (length > LINE_SIZE - 1) + length = LINE_SIZE - 1; + + if (length < 0) + return -EINVAL; + + if (copy_from_user(line, buf, length)) return -EFAULT; linelen = strlen(line); -- cgit v1.2.3 From e3be785fb59f92c0df685037062d041619653b7a Mon Sep 17 00:00:00 2001 From: Marin Mitov Date: Sat, 3 Oct 2009 20:45:02 +0300 Subject: x86, pci: Correct spelling in a comment Signed-off-by: Marin Mitov Cc: Joerg Roedel Cc: Jesse Brandeburg LKML-Reference: <200910032045.02523.mitov@issp.bas.bg> Signed-off-by: Ingo Molnar ====================================================== --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 64b838eac18c..d20009b4e6ef 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -35,7 +35,7 @@ int iommu_detected __read_mostly = 0; /* * This variable becomes 1 if iommu=pt is passed on the kernel command line. - * If this variable is 1, IOMMU implementations do no DMA ranslation for + * If this variable is 1, IOMMU implementations do no DMA translation for * devices and allow every device to access to whole physical memory. This is * useful if a user want to use an IOMMU only for KVM device assignment to * guests and not for driver dma translation. -- cgit v1.2.3 From 9bcbdd9c58617f1301dd4f17c738bb9bc73aca70 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 8 Oct 2009 06:40:41 -0700 Subject: x86, timers: Check for pending timers after (device) interrupts Now that range timers and deferred timers are common, I found a problem with these using the "perf timechart" tool. Frans Pop also reported high scheduler latencies via LatencyTop, when using iwlagn. It turns out that on x86, these two 'opportunistic' timers only get checked when another "real" timer happens. These opportunistic timers have the objective to save power by hitchhiking on other wakeups, as to avoid CPU wakeups by themselves as much as possible. The change in this patch runs this check not only at timer interrupts, but at all (device) interrupts. The effect is that: 1) the deferred timers/range timers get delayed less 2) the range timers cause less wakeups by themselves because the percentage of hitchhiking on existing wakeup events goes up. I've verified the working of the patch using "perf timechart", the original exposed bug is gone with this patch. Frans also reported success - the latencies are now down in the expected ~10 msec range. Signed-off-by: Arjan van de Ven Tested-by: Frans Pop Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Mike Galbraith LKML-Reference: <20091008064041.67219b13@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 ++ arch/x86/kernel/smp.c | 1 + 2 files changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 74656d1d4e30..391206199515 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -244,6 +244,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) __func__, smp_processor_id(), vector, irq); } + run_local_timers(); irq_exit(); set_irq_regs(old_regs); @@ -268,6 +269,7 @@ void smp_generic_interrupt(struct pt_regs *regs) if (generic_interrupt_extension) generic_interrupt_extension(); + run_local_timers(); irq_exit(); set_irq_regs(old_regs); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index ec1de97600e7..d915d956e66d 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -198,6 +198,7 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); + run_local_timers(); /* * KVM uses this interrupt to force a cpu out of guest mode */ -- cgit v1.2.3 From d0153ca35d344d9b640dc305031b0703ba3f30f0 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 29 Sep 2009 10:25:24 -0700 Subject: x86, vmi: Mark VMI deprecated and schedule it for removal Add text in feature-removal.txt indicating that VMI will be removed in the 2.6.37 timeframe. Signed-off-by: Alok N Kataria Acked-by: Chris Wright LKML-Reference: <1254193238.13456.48.camel@ank32.eng.vmware.com> [ removed a bogus Kconfig change, marked (DEPRECATED) in Kconfig ] Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- Documentation/feature-removal-schedule.txt | 30 ++++++++++++++++++++++++++++++ arch/x86/Kconfig | 11 ++++++++++- arch/x86/kernel/vmi_32.c | 2 +- 3 files changed, 41 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index 89a47b5aff07..04e6c819b28a 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -451,3 +451,33 @@ Why: OSS sound_core grabs all legacy minors (0-255) of SOUND_MAJOR will also allow making ALSA OSS emulation independent of sound_core. The dependency will be broken then too. Who: Tejun Heo + +---------------------------- + +What: Support for VMware's guest paravirtuliazation technique [VMI] will be + dropped. +When: 2.6.37 or earlier. +Why: With the recent innovations in CPU hardware acceleration technologies + from Intel and AMD, VMware ran a few experiments to compare these + techniques to guest paravirtualization technique on VMware's platform. + These hardware assisted virtualization techniques have outperformed the + performance benefits provided by VMI in most of the workloads. VMware + expects that these hardware features will be ubiquitous in a couple of + years, as a result, VMware has started a phased retirement of this + feature from the hypervisor. We will be removing this feature from the + Kernel too. Right now we are targeting 2.6.37 but can retire earlier if + technical reasons (read opportunity to remove major chunk of pvops) + arise. + + Please note that VMI has always been an optimization and non-VMI kernels + still work fine on VMware's platform. + Latest versions of VMware's product which support VMI are, + Workstation 7.0 and VSphere 4.0 on ESX side, future maintainence + releases for these products will continue supporting VMI. + + For more details about VMI retirement take a look at this, + http://blogs.vmware.com/guestosguide/2009/09/vmi-retirement.html + +Who: Alok N Kataria + +---------------------------- diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c876bace8fdc..07e01149e3bf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -491,7 +491,7 @@ if PARAVIRT_GUEST source "arch/x86/xen/Kconfig" config VMI - bool "VMI Guest support" + bool "VMI Guest support (DEPRECATED)" select PARAVIRT depends on X86_32 ---help--- @@ -500,6 +500,15 @@ config VMI at the moment), by linking the kernel to a GPL-ed ROM module provided by the hypervisor. + As of September 2009, VMware has started a phased retirement + of this feature from VMware's products. Please see + feature-removal-schedule.txt for details. If you are + planning to enable this option, please note that you cannot + live migrate a VMI enabled VM to a future VMware product, + which doesn't support VMI. So if you expect your kernel to + seamlessly migrate to newer VMware products, keep this + disabled. + config KVM_CLOCK bool "KVM paravirtualized clock" select PARAVIRT diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 31e6f6cfe53e..d430e4c30193 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -648,7 +648,7 @@ static inline int __init activate_vmi(void) pv_info.paravirt_enabled = 1; pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK; - pv_info.name = "vmi"; + pv_info.name = "vmi [deprecated]"; pv_init_ops.patch = vmi_patch; -- cgit v1.2.3 From e7ab0f7b50bc4688fb5cf65de5d42e3b882fb8d1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 9 Oct 2009 15:58:20 +0200 Subject: Revert "x86, timers: Check for pending timers after (device) interrupts" This reverts commit 9bcbdd9c58617f1301dd4f17c738bb9bc73aca70. The real bug producing LatencyTop latencies has been fixed in: f5dc375: sched: Update the clock of runqueue select_task_rq() selected And the commit being reverted here triggers local timer processing from every device IRQ. If device IRQs come in at a high frequency, this could cause a performance regression. The commit being reverted here purely 'fixed' the reported latency as a side effect, because CPUs were being moved out of idle more often. Acked-by: Peter Zijlstra Cc: Arjan van de Ven Cc: Frans Pop Cc: Linus Torvalds Cc: Mike Galbraith Cc: Thomas Gleixner LKML-Reference: <20091008064041.67219b13@infradead.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 -- arch/x86/kernel/smp.c | 1 - 2 files changed, 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..74656d1d4e30 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -244,7 +244,6 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) __func__, smp_processor_id(), vector, irq); } - run_local_timers(); irq_exit(); set_irq_regs(old_regs); @@ -269,7 +268,6 @@ void smp_generic_interrupt(struct pt_regs *regs) if (generic_interrupt_extension) generic_interrupt_extension(); - run_local_timers(); irq_exit(); set_irq_regs(old_regs); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d915d956e66d..ec1de97600e7 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -198,7 +198,6 @@ void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); inc_irq_stat(irq_resched_count); - run_local_timers(); /* * KVM uses this interrupt to force a cpu out of guest mode */ -- cgit v1.2.3 From c5cca146aa03e1f60fb179df65f0dbaf17bc64ed Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 9 Oct 2009 18:31:20 +0200 Subject: x86/amd-iommu: Workaround for erratum 63 There is an erratum for IOMMU hardware which documents undefined behavior when forwarding SMI requests from peripherals and the DTE of that peripheral has a sysmgt value of 01b. This problem caused weird IO_PAGE_FAULTS in my case. This patch implements the suggested workaround for that erratum into the AMD IOMMU driver. The erratum is documented with number 63. Cc: stable@kernel.org Signed-off-by: Joerg Roedel --- arch/x86/include/asm/amd_iommu.h | 1 + arch/x86/kernel/amd_iommu.c | 2 ++ arch/x86/kernel/amd_iommu_init.c | 22 ++++++++++++++++++++++ 3 files changed, 25 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h index bdf96f119f06..9dbd4030f0cd 100644 --- a/arch/x86/include/asm/amd_iommu.h +++ b/arch/x86/include/asm/amd_iommu.h @@ -30,6 +30,7 @@ extern irqreturn_t amd_iommu_int_handler(int irq, void *data); extern void amd_iommu_flush_all_domains(void); extern void amd_iommu_flush_all_devices(void); extern void amd_iommu_shutdown(void); +extern void amd_iommu_apply_erratum_63(u16 devid); #else static inline int amd_iommu_init(void) { return -ENODEV; } static inline void amd_iommu_detect(void) { } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6c99f5037801..f95dfe526444 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1114,6 +1114,8 @@ static void __detach_device(struct protection_domain *domain, u16 devid) amd_iommu_dev_table[devid].data[1] = 0; amd_iommu_dev_table[devid].data[2] = 0; + amd_iommu_apply_erratum_63(devid); + /* decrease reference counter */ domain->dev_cnt -= 1; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c1b17e97252e..498c8c79aaa5 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -509,6 +509,26 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +static int get_dev_entry_bit(u16 devid, u8 bit) +{ + int i = (bit >> 5) & 0x07; + int _bit = bit & 0x1f; + + return (amd_iommu_dev_table[devid].data[i] & (1 << _bit)) >> _bit; +} + + +void amd_iommu_apply_erratum_63(u16 devid) +{ + int sysmgt; + + sysmgt = get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT1) | + (get_dev_entry_bit(devid, DEV_ENTRY_SYSMGT2) << 1); + + if (sysmgt == 0x01) + set_dev_entry_bit(devid, DEV_ENTRY_IW); +} + /* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { @@ -537,6 +557,8 @@ static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); + amd_iommu_apply_erratum_63(devid); + set_iommu_for_device(iommu, devid); } -- cgit v1.2.3 From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 7 Oct 2009 17:09:06 +0400 Subject: headers: remove sched.h from interrupt.h After m68k's task_thread_info() doesn't refer to current, it's possible to remove sched.h from interrupt.h and not break m68k! Many thanks to Heiko Carstens for allowing this. Signed-off-by: Alexey Dobriyan --- arch/arm/kernel/time.c | 1 + arch/arm/mach-integrator/pci_v3.c | 1 + arch/arm/plat-s3c24xx/adc.c | 1 + arch/blackfin/kernel/time.c | 1 + arch/m32r/kernel/smp.c | 1 + arch/um/drivers/line.c | 1 + arch/um/drivers/port_kern.c | 1 + arch/um/kernel/irq.c | 1 + arch/x86/kernel/cpu/mcheck/mce_intel.c | 1 + arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/reboot.c | 1 + arch/xtensa/kernel/time.c | 1 + crypto/aead.c | 1 + drivers/char/applicom.c | 1 + drivers/char/epca.c | 1 + drivers/char/generic_serial.c | 1 + drivers/char/istallion.c | 1 + drivers/char/nozomi.c | 1 + drivers/char/pty.c | 1 + drivers/char/rio/riocmd.c | 1 + drivers/char/rio/rioctrl.c | 1 + drivers/char/rio/riotty.c | 1 + drivers/char/ser_a2232.c | 1 + drivers/char/stallion.c | 1 + drivers/char/tlclk.c | 1 + drivers/hwmon/sht15.c | 1 + drivers/ieee1394/raw1394.c | 1 + drivers/ieee1394/video1394.c | 1 + drivers/infiniband/core/iwcm.c | 1 + drivers/infiniband/core/ucma.c | 1 + drivers/infiniband/hw/cxgb3/iwch_provider.c | 1 + drivers/infiniband/hw/cxgb3/iwch_qp.c | 1 + drivers/infiniband/hw/ipath/ipath_driver.c | 1 + drivers/infiniband/hw/ipath/ipath_iba7220.c | 1 + drivers/infiniband/hw/ipath/ipath_intr.c | 1 + drivers/infiniband/hw/ipath/ipath_qp.c | 1 + drivers/infiniband/hw/ipath/ipath_ruc.c | 1 + drivers/infiniband/hw/ipath/ipath_ud.c | 1 + drivers/infiniband/hw/ipath/ipath_user_pages.c | 1 + drivers/infiniband/hw/ipath/ipath_user_sdma.c | 1 + drivers/infiniband/hw/ipath/ipath_verbs_mcast.c | 1 + drivers/input/keyboard/hilkbd.c | 1 + drivers/input/keyboard/sunkbd.c | 1 + drivers/input/serio/libps2.c | 1 + drivers/input/serio/serio_raw.c | 1 + drivers/input/serio/serport.c | 1 + drivers/isdn/capi/kcapi.c | 1 + drivers/isdn/hisax/arcofi.c | 1 + drivers/isdn/hisax/hfc_2bds0.c | 1 + drivers/isdn/hisax/hfc_pci.c | 1 + drivers/isdn/hysdn/hysdn_procconf.c | 1 + drivers/isdn/hysdn/hysdn_proclog.c | 1 + drivers/isdn/pcbit/drv.c | 1 + drivers/isdn/pcbit/layer2.c | 1 + drivers/isdn/sc/init.c | 1 + drivers/lguest/interrupts_and_traps.c | 1 + drivers/media/dvb/dvb-core/dvb_net.c | 1 + drivers/media/video/meye.c | 1 + drivers/media/video/videobuf-core.c | 1 + drivers/media/video/videobuf-dma-sg.c | 1 + drivers/message/fusion/mptlan.c | 1 + drivers/mfd/ucb1x00-core.c | 1 + drivers/misc/hpilo.c | 1 + drivers/misc/ibmasm/command.c | 1 + drivers/misc/ibmasm/event.c | 1 + drivers/misc/ibmasm/r_heartbeat.c | 1 + drivers/misc/phantom.c | 1 + drivers/mtd/devices/m25p80.c | 1 + drivers/mtd/devices/sst25l.c | 1 + drivers/net/bonding/bond_sysfs.c | 1 + drivers/net/depca.c | 1 + drivers/net/e100.c | 1 + drivers/net/eql.c | 1 + drivers/net/ethoc.c | 1 + drivers/net/ewrk3.c | 1 + drivers/net/forcedeth.c | 1 + drivers/net/hamachi.c | 1 + drivers/net/hamradio/baycom_epp.c | 1 + drivers/net/hamradio/baycom_ser_fdx.c | 1 + drivers/net/hamradio/baycom_ser_hdx.c | 1 + drivers/net/hamradio/hdlcdrv.c | 1 + drivers/net/hp100.c | 1 + drivers/net/igb/igb_ethtool.c | 1 + drivers/net/irda/toim3232-sir.c | 1 + drivers/net/ns83820.c | 1 + drivers/net/pcnet32.c | 1 + drivers/net/sb1000.c | 1 + drivers/net/sis900.c | 1 + drivers/net/skfp/skfddi.c | 1 + drivers/net/skge.c | 1 + drivers/net/slip.c | 1 + drivers/net/sungem.c | 1 + drivers/net/tokenring/ibmtr.c | 1 + drivers/net/typhoon.c | 1 + drivers/net/wan/cosa.c | 1 + drivers/net/wan/cycx_x25.c | 1 + drivers/net/wan/dscc4.c | 1 + drivers/net/wan/farsync.c | 1 + drivers/net/wireless/b43/pio.c | 1 + drivers/net/wireless/b43legacy/main.c | 1 + drivers/net/wireless/b43legacy/phy.c | 1 + drivers/net/wireless/hostap/hostap_info.c | 1 + drivers/net/wireless/hostap/hostap_ioctl.c | 1 + drivers/net/wireless/ipw2x00/ipw2200.c | 1 + drivers/net/wireless/iwlwifi/iwl-3945.c | 1 + drivers/net/wireless/iwlwifi/iwl-4965.c | 1 + drivers/net/wireless/iwlwifi/iwl-5000.c | 1 + drivers/net/wireless/iwlwifi/iwl-agn.c | 1 + drivers/net/wireless/iwlwifi/iwl-core.c | 1 + drivers/net/wireless/iwlwifi/iwl-hcmd.c | 1 + drivers/net/wireless/iwlwifi/iwl-tx.c | 1 + drivers/net/wireless/iwlwifi/iwl3945-base.c | 1 + drivers/net/wireless/iwmc3200wifi/cfg80211.c | 1 + drivers/net/wireless/iwmc3200wifi/commands.c | 1 + drivers/net/wireless/iwmc3200wifi/main.c | 1 + drivers/net/wireless/iwmc3200wifi/rx.c | 1 + drivers/net/wireless/libertas/cmd.c | 1 + drivers/net/wireless/libertas/tx.c | 1 + drivers/net/wireless/prism54/isl_ioctl.c | 1 + drivers/net/wireless/prism54/islpci_dev.c | 1 + drivers/net/wireless/prism54/islpci_mgt.c | 1 + drivers/net/wireless/rt2x00/rt2x00debug.c | 1 + drivers/pci/pcie/aer/aerdrv.c | 1 + drivers/rtc/interface.c | 1 + drivers/rtc/rtc-dev.c | 1 + drivers/uio/uio.c | 1 + drivers/uwb/whc-rc.c | 1 + fs/file.c | 1 + include/linux/interrupt.h | 2 +- include/linux/mmc/host.h | 1 + kernel/irq/handle.c | 1 + kernel/mutex-debug.c | 1 + kernel/time/timekeeping.c | 1 + lib/debugobjects.c | 1 + lib/fault-inject.c | 1 + mm/vmalloc.c | 1 + net/irda/ircomm/ircomm_tty_attach.c | 1 + net/irda/irlan/irlan_common.c | 1 + net/irda/irlan/irlan_eth.c | 1 + net/irda/irnet/irnet_irda.c | 1 + net/irda/irnet/irnet_ppp.c | 1 + net/mac80211/rc80211_pid_debugfs.c | 1 + net/netfilter/nf_conntrack_core.c | 1 + net/sunrpc/xprtrdma/svc_rdma_transport.c | 1 + net/wireless/core.c | 1 + 145 files changed, 145 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 4cdc4a0bd02d..d38cdf2c8276 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/arm/mach-integrator/pci_v3.c b/arch/arm/mach-integrator/pci_v3.c index 901cc205015e..148d25fc636f 100644 --- a/arch/arm/mach-integrator/pci_v3.c +++ b/arch/arm/mach-integrator/pci_v3.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/arch/arm/plat-s3c24xx/adc.c b/arch/arm/plat-s3c24xx/adc.c index 11117a7ba911..4d36b784fb8b 100644 --- a/arch/arm/plat-s3c24xx/adc.c +++ b/arch/arm/plat-s3c24xx/adc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c index e5069fe6861e..bd3b53da295e 100644 --- a/arch/blackfin/kernel/time.c +++ b/arch/blackfin/kernel/time.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c index 1b7598e6f6e8..8a88f1f0a3e2 100644 --- a/arch/m32r/kernel/smp.c +++ b/arch/m32r/kernel/smp.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c index 14a102e877d6..cf8a97f34518 100644 --- a/arch/um/drivers/line.c +++ b/arch/um/drivers/line.c @@ -5,6 +5,7 @@ #include "linux/irqreturn.h" #include "linux/kd.h" +#include "linux/sched.h" #include "chan_kern.h" #include "irq_kern.h" #include "irq_user.h" diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c index 19930081d3d8..4ebc8a34738f 100644 --- a/arch/um/drivers/port_kern.c +++ b/arch/um/drivers/port_kern.c @@ -7,6 +7,7 @@ #include "linux/interrupt.h" #include "linux/list.h" #include "linux/mutex.h" +#include "linux/workqueue.h" #include "asm/atomic.h" #include "init.h" #include "irq_kern.h" diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c index 454cdb43e351..039270b9b73b 100644 --- a/arch/um/kernel/irq.c +++ b/arch/um/kernel/irq.c @@ -10,6 +10,7 @@ #include "linux/interrupt.h" #include "linux/kernel_stat.h" #include "linux/module.h" +#include "linux/sched.h" #include "linux/seq_file.h" #include "as-layout.h" #include "kern_util.h" diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 889f665fe93d..7c785634af2b 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 98a827ee9ed7..a7f1b64f86e0 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 27349f92a6d7..a1a3cdda06e1 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c index 19085ff0484a..19f7df30937f 100644 --- a/arch/xtensa/kernel/time.c +++ b/arch/xtensa/kernel/time.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include diff --git a/crypto/aead.c b/crypto/aead.c index d9aa733db164..0a55da70845e 100644 --- a/crypto/aead.c +++ b/crypto/aead.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c index 73a0765344b6..fe2cb2f5db17 100644 --- a/drivers/char/applicom.c +++ b/drivers/char/applicom.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/epca.c b/drivers/char/epca.c index 9d589e3144de..dde5134713e2 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/generic_serial.c b/drivers/char/generic_serial.c index 9e4e569dc00d..d400cbd280f2 100644 --- a/drivers/char/generic_serial.c +++ b/drivers/char/generic_serial.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index ab2f3349c5c4..402838f4083e 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -19,6 +19,7 @@ /*****************************************************************************/ #include +#include #include #include #include diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c index ec58d8c387ff..d3400b20444f 100644 --- a/drivers/char/nozomi.c +++ b/drivers/char/nozomi.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/pty.c b/drivers/char/pty.c index e066c4fdf81b..62f282e67638 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/rio/riocmd.c b/drivers/char/rio/riocmd.c index 01f2654d5a2e..f121357e5af0 100644 --- a/drivers/char/rio/riocmd.c +++ b/drivers/char/rio/riocmd.c @@ -32,6 +32,7 @@ */ #include +#include #include #include #include diff --git a/drivers/char/rio/rioctrl.c b/drivers/char/rio/rioctrl.c index 74339559f0b9..780506326a73 100644 --- a/drivers/char/rio/rioctrl.c +++ b/drivers/char/rio/rioctrl.c @@ -31,6 +31,7 @@ */ #include +#include #include #include #include diff --git a/drivers/char/rio/riotty.c b/drivers/char/rio/riotty.c index 2fb49e89b324..47fab7c33073 100644 --- a/drivers/char/rio/riotty.c +++ b/drivers/char/rio/riotty.c @@ -33,6 +33,7 @@ #define __EXPLICIT_DEF_H__ #include +#include #include #include #include diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c index 33a2b531802e..9610861d1f5f 100644 --- a/drivers/char/ser_a2232.c +++ b/drivers/char/ser_a2232.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c index 53e504f41b20..db6dcfa35ba0 100644 --- a/drivers/char/stallion.c +++ b/drivers/char/stallion.c @@ -27,6 +27,7 @@ /*****************************************************************************/ #include +#include #include #include #include diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c index 8f2284be68e1..80ea6bcfffdc 100644 --- a/drivers/char/tlclk.c +++ b/drivers/char/tlclk.c @@ -32,6 +32,7 @@ #include /* printk() */ #include /* everything... */ #include /* error codes */ +#include #include #include #include diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c index 303c02694c3c..2da6fb2c325e 100644 --- a/drivers/hwmon/sht15.c +++ b/drivers/hwmon/sht15.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c index 0bc3d78ce7b1..8aa56ac07e29 100644 --- a/drivers/ieee1394/raw1394.c +++ b/drivers/ieee1394/raw1394.c @@ -29,6 +29,7 @@ #include #include +#include #include #include #include diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c index d287ba79821d..949064a05675 100644 --- a/drivers/ieee1394/video1394.c +++ b/drivers/ieee1394/video1394.c @@ -30,6 +30,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c index 625fec5a741c..0f89909abce9 100644 --- a/drivers/infiniband/core/iwcm.c +++ b/drivers/infiniband/core/iwcm.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 4346a24568fb..bb96d3c4b0f4 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index 03cfaecc3bb7..ed7175549ebd 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index 6e8653471941..1cecf98829ac 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -29,6 +29,7 @@ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ +#include #include "iwch_provider.h" #include "iwch.h" #include "iwch_cm.h" diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c index 04e88b600558..013d1380e77c 100644 --- a/drivers/infiniband/hw/ipath/ipath_driver.c +++ b/drivers/infiniband/hw/ipath/ipath_driver.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c index b2a9d4c155d1..a805402dd4ae 100644 --- a/drivers/infiniband/hw/ipath/ipath_iba7220.c +++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c @@ -37,6 +37,7 @@ #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c index 6c21b4b5ec71..c0a03ac03ee7 100644 --- a/drivers/infiniband/hw/ipath/ipath_intr.c +++ b/drivers/infiniband/hw/ipath/ipath_intr.c @@ -33,6 +33,7 @@ #include #include +#include #include "ipath_kernel.h" #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c index 3a5a89b609c4..cb2d3ef2ae12 100644 --- a/drivers/infiniband/hw/ipath/ipath_qp.c +++ b/drivers/infiniband/hw/ipath/ipath_qp.c @@ -32,6 +32,7 @@ */ #include +#include #include #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c index 2296832f94da..1f95bbaf7602 100644 --- a/drivers/infiniband/hw/ipath/ipath_ruc.c +++ b/drivers/infiniband/hw/ipath/ipath_ruc.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c index 6076cb61bf6a..7420715256a9 100644 --- a/drivers/infiniband/hw/ipath/ipath_ud.c +++ b/drivers/infiniband/hw/ipath/ipath_ud.c @@ -31,6 +31,7 @@ * SOFTWARE. */ +#include #include #include "ipath_verbs.h" diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c index 855911e7396d..82878e348627 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_pages.c +++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c @@ -33,6 +33,7 @@ #include #include +#include #include "ipath_kernel.h" diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c index 7bff4b9baa0a..be78f6643c06 100644 --- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c +++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c index d73e32232879..6923e1d986da 100644 --- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c +++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c @@ -32,6 +32,7 @@ */ #include +#include #include "ipath_verbs.h" diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c index e9d639ec283d..5f72440b50c8 100644 --- a/drivers/input/keyboard/hilkbd.c +++ b/drivers/input/keyboard/hilkbd.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #ifdef CONFIG_HP300 diff --git a/drivers/input/keyboard/sunkbd.c b/drivers/input/keyboard/sunkbd.c index 472b56639cdb..a99a04b03ee4 100644 --- a/drivers/input/keyboard/sunkbd.c +++ b/drivers/input/keyboard/sunkbd.c @@ -27,6 +27,7 @@ */ #include +#include #include #include #include diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 769ba65a585a..f3876acc3e83 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -13,6 +13,7 @@ #include #include +#include #include #include #include diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c index b03009bb7468..27fdaaffbb40 100644 --- a/drivers/input/serio/serio_raw.c +++ b/drivers/input/serio/serio_raw.c @@ -9,6 +9,7 @@ * the Free Software Foundation. */ +#include #include #include #include diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c index b9694b6445d0..6d345112bcb7 100644 --- a/drivers/input/serio/serport.c +++ b/drivers/input/serio/serport.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 57d26360f64e..dc506ab99cac 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/isdn/hisax/arcofi.c b/drivers/isdn/hisax/arcofi.c index d30ce5b978c2..85a8fd8dd0b7 100644 --- a/drivers/isdn/hisax/arcofi.c +++ b/drivers/isdn/hisax/arcofi.c @@ -10,6 +10,7 @@ * */ +#include #include "hisax.h" #include "isdnl1.h" #include "isac.h" diff --git a/drivers/isdn/hisax/hfc_2bds0.c b/drivers/isdn/hisax/hfc_2bds0.c index 5c46a7130e06..8d22f50760eb 100644 --- a/drivers/isdn/hisax/hfc_2bds0.c +++ b/drivers/isdn/hisax/hfc_2bds0.c @@ -11,6 +11,7 @@ */ #include +#include #include "hisax.h" #include "hfc_2bds0.h" #include "isdnl1.h" diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c index d110a77940a4..10914731b304 100644 --- a/drivers/isdn/hisax/hfc_pci.c +++ b/drivers/isdn/hisax/hfc_pci.c @@ -20,6 +20,7 @@ #include "hfc_pci.h" #include "isdnl1.h" #include +#include #include static const char *hfcpci_revision = "$Revision: 1.48.2.4 $"; diff --git a/drivers/isdn/hysdn/hysdn_procconf.c b/drivers/isdn/hysdn/hysdn_procconf.c index 8f9f4912de32..90b35e1a4b7e 100644 --- a/drivers/isdn/hysdn/hysdn_procconf.c +++ b/drivers/isdn/hysdn/hysdn_procconf.c @@ -11,6 +11,7 @@ * */ +#include #include #include #include diff --git a/drivers/isdn/hysdn/hysdn_proclog.c b/drivers/isdn/hysdn/hysdn_proclog.c index 8991d2c8ee4a..8bcae28c4409 100644 --- a/drivers/isdn/hysdn/hysdn_proclog.c +++ b/drivers/isdn/hysdn/hysdn_proclog.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "hysdn_defs.h" diff --git a/drivers/isdn/pcbit/drv.c b/drivers/isdn/pcbit/drv.c index 8c66bcb953a1..123c1d6c43b4 100644 --- a/drivers/isdn/pcbit/drv.c +++ b/drivers/isdn/pcbit/drv.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/drivers/isdn/pcbit/layer2.c b/drivers/isdn/pcbit/layer2.c index e075e8d2fce0..30f0f45e3139 100644 --- a/drivers/isdn/pcbit/layer2.c +++ b/drivers/isdn/pcbit/layer2.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/isdn/sc/init.c b/drivers/isdn/sc/init.c index dd0acd06750b..5a0774880d56 100644 --- a/drivers/isdn/sc/init.c +++ b/drivers/isdn/sc/init.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "includes.h" #include "hardware.h" #include "card.h" diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 18648180db02..daaf86631647 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "lg.h" /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */ diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c index 8c9ae0a3a272..0241a7c5c34a 100644 --- a/drivers/media/dvb/dvb-core/dvb_net.c +++ b/drivers/media/dvb/dvb-core/dvb_net.c @@ -63,6 +63,7 @@ #include #include #include +#include #include "dvb_demux.h" #include "dvb_net.h" diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c index 4b1bc05a462c..01e1eefcf1eb 100644 --- a/drivers/media/video/meye.c +++ b/drivers/media/video/meye.c @@ -28,6 +28,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/media/video/videobuf-core.c b/drivers/media/video/videobuf-core.c index f1ccf98c0a6f..8e93c6f25c83 100644 --- a/drivers/media/video/videobuf-core.c +++ b/drivers/media/video/videobuf-core.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c index 53cdd67cebe1..032ebae0134a 100644 --- a/drivers/media/video/videobuf-dma-sg.c +++ b/drivers/media/video/videobuf-dma-sg.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c index bc2ec2182c00..34f3f36f819b 100644 --- a/drivers/message/fusion/mptlan.c +++ b/drivers/message/fusion/mptlan.c @@ -56,6 +56,7 @@ #include #include #include +#include #define my_VERSION MPT_LINUX_VERSION_COMMON #define MYNAM "mptlan" diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c index fea9085fe52c..60c3988f3cf3 100644 --- a/drivers/mfd/ucb1x00-core.c +++ b/drivers/mfd/ucb1x00-core.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c index 1ad27c6abcca..a92a3a742b43 100644 --- a/drivers/misc/hpilo.c +++ b/drivers/misc/hpilo.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/ibmasm/command.c b/drivers/misc/ibmasm/command.c index 276d3fb68094..e2031739aa29 100644 --- a/drivers/misc/ibmasm/command.c +++ b/drivers/misc/ibmasm/command.c @@ -22,6 +22,7 @@ * */ +#include #include "ibmasm.h" #include "lowlevel.h" diff --git a/drivers/misc/ibmasm/event.c b/drivers/misc/ibmasm/event.c index 68a0a5b94795..572d41ffc186 100644 --- a/drivers/misc/ibmasm/event.c +++ b/drivers/misc/ibmasm/event.c @@ -22,6 +22,7 @@ * */ +#include #include "ibmasm.h" #include "lowlevel.h" diff --git a/drivers/misc/ibmasm/r_heartbeat.c b/drivers/misc/ibmasm/r_heartbeat.c index bec9e2c44bef..2de487ac788c 100644 --- a/drivers/misc/ibmasm/r_heartbeat.c +++ b/drivers/misc/ibmasm/r_heartbeat.c @@ -20,6 +20,7 @@ * */ +#include #include "ibmasm.h" #include "dot_command.h" diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c index 90a95ce8dc34..04c27266f567 100644 --- a/drivers/misc/phantom.c +++ b/drivers/misc/phantom.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 379c316f329e..4c19269de91a 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/drivers/mtd/devices/sst25l.c b/drivers/mtd/devices/sst25l.c index c2baf3353f84..0a11721f146e 100644 --- a/drivers/mtd/devices/sst25l.c +++ b/drivers/mtd/devices/sst25l.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c index ff449de6f3c0..8762a27a2a18 100644 --- a/drivers/net/bonding/bond_sysfs.c +++ b/drivers/net/bonding/bond_sysfs.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/depca.c b/drivers/net/depca.c index 9686c1fa28f1..7a3bdac84abe 100644 --- a/drivers/net/depca.c +++ b/drivers/net/depca.c @@ -237,6 +237,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/e100.c b/drivers/net/e100.c index 679965c2bb86..5d2f48f02251 100644 --- a/drivers/net/e100.c +++ b/drivers/net/e100.c @@ -151,6 +151,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/eql.c b/drivers/net/eql.c index d4d9a3eda695..f5b96cadeb25 100644 --- a/drivers/net/eql.c +++ b/drivers/net/eql.c @@ -111,6 +111,7 @@ * Sorry, I had to rewrite most of this for 2.5.x -DaveM */ +#include #include #include #include diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c index 34d0c69e67f7..96f5b2a2d2c5 100644 --- a/drivers/net/ethoc.c +++ b/drivers/net/ethoc.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static int buffer_size = 0x8000; /* 32 KBytes */ diff --git a/drivers/net/ewrk3.c b/drivers/net/ewrk3.c index b2a5ec8f3721..dd4ba01fd92d 100644 --- a/drivers/net/ewrk3.c +++ b/drivers/net/ewrk3.c @@ -145,6 +145,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index 0a1c2bb27d4d..e1da4666f204 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c index 1d5064a09aca..f7519a594945 100644 --- a/drivers/net/hamachi.c +++ b/drivers/net/hamachi.c @@ -145,6 +145,7 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1}; /* Time in jiffies before concluding the transmitter is hung. */ #define TX_TIMEOUT (5*HZ) +#include #include #include #include diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 7bcaf7c66243..e344c84c0ef9 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c index aa4488e871b2..ed60fd664273 100644 --- a/drivers/net/hamradio/baycom_ser_fdx.c +++ b/drivers/net/hamradio/baycom_ser_fdx.c @@ -71,6 +71,7 @@ /*****************************************************************************/ +#include #include #include #include diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c index 88c593596020..1686f6dcbbce 100644 --- a/drivers/net/hamradio/baycom_ser_hdx.c +++ b/drivers/net/hamradio/baycom_ser_hdx.c @@ -61,6 +61,7 @@ /*****************************************************************************/ +#include #include #include #include diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c index 0013c409782c..91c5790c9581 100644 --- a/drivers/net/hamradio/hdlcdrv.c +++ b/drivers/net/hamradio/hdlcdrv.c @@ -42,6 +42,7 @@ /*****************************************************************************/ +#include #include #include #include diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c index a9a1a99f02dd..dd8665138062 100644 --- a/drivers/net/hp100.c +++ b/drivers/net/hp100.c @@ -98,6 +98,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c index d004c359244c..deaea8fa1032 100644 --- a/drivers/net/igb/igb_ethtool.c +++ b/drivers/net/igb/igb_ethtool.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "igb.h" diff --git a/drivers/net/irda/toim3232-sir.c b/drivers/net/irda/toim3232-sir.c index fcf287b749db..99e1ec02a011 100644 --- a/drivers/net/irda/toim3232-sir.c +++ b/drivers/net/irda/toim3232-sir.c @@ -120,6 +120,7 @@ #include #include #include +#include #include diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index c594e1946476..57fd483dbb1f 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -111,6 +111,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 6d28b18e7e28..c1b3f09f452c 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -31,6 +31,7 @@ static const char *const version = #include #include +#include #include #include #include diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c index ee366c5a8fa3..c9c70ab0cce0 100644 --- a/drivers/net/sb1000.c +++ b/drivers/net/sb1000.c @@ -36,6 +36,7 @@ static char version[] = "sb1000.c:v1.1.2 6/01/98 (fventuri@mediaone.net)\n"; #include #include +#include #include #include #include diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c index 97949d0a699b..c072f7f36acf 100644 --- a/drivers/net/sis900.c +++ b/drivers/net/sis900.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c index 38a508b4aad9..b27156eaf267 100644 --- a/drivers/net/skfp/skfddi.c +++ b/drivers/net/skfp/skfddi.c @@ -73,6 +73,7 @@ static const char * const boot_msg = /* Include files */ +#include #include #include #include diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 01f6811f1324..8f5414348e86 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/slip.c b/drivers/net/slip.c index e17c535a577e..fe3cebb984de 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c index 305ec3d783db..7019a0d1a82b 100644 --- a/drivers/net/sungem.c +++ b/drivers/net/sungem.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/tokenring/ibmtr.c b/drivers/net/tokenring/ibmtr.c index 525bbc5b9c9d..36cb2423bcf1 100644 --- a/drivers/net/tokenring/ibmtr.c +++ b/drivers/net/tokenring/ibmtr.c @@ -108,6 +108,7 @@ in the event that chatty debug messages are desired - jjs 12/30/98 */ #define IBMTR_DEBUG_MESSAGES 0 #include +#include #ifdef PCMCIA /* required for ibmtr_cs.c to build */ #undef MODULE /* yes, really */ diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c index d6d345229fe9..5921f5bdd764 100644 --- a/drivers/net/typhoon.c +++ b/drivers/net/typhoon.c @@ -108,6 +108,7 @@ static const int multicast_filter_limit = 32; #include #include +#include #include #include #include diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index 66360a2a14c2..e2c33c06190b 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -76,6 +76,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/wan/cycx_x25.c b/drivers/net/wan/cycx_x25.c index 2573c18b6aa5..cd8cb95c5bd7 100644 --- a/drivers/net/wan/cycx_x25.c +++ b/drivers/net/wan/cycx_x25.c @@ -84,6 +84,7 @@ #include /* printk(), and other useful stuff */ #include #include /* inline memset(), etc. */ +#include #include /* kmalloc(), kfree() */ #include /* offsetof(), etc. */ #include /* WAN router definitions */ diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c index 81c8aec9df92..07d00b4cf48a 100644 --- a/drivers/net/wan/dscc4.c +++ b/drivers/net/wan/dscc4.c @@ -81,6 +81,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c index 3e90eb816181..beda387f2fc7 100644 --- a/drivers/net/wan/farsync.c +++ b/drivers/net/wan/farsync.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/b43/pio.c b/drivers/net/wireless/b43/pio.c index 9c1397996e0a..5e87650b07fb 100644 --- a/drivers/net/wireless/b43/pio.c +++ b/drivers/net/wireless/b43/pio.c @@ -30,6 +30,7 @@ #include "xmit.h" #include +#include static u16 generate_cookie(struct b43_pio_txqueue *q, diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c index 1d9223b3d4c4..4b60148a5e61 100644 --- a/drivers/net/wireless/b43legacy/main.c +++ b/drivers/net/wireless/b43legacy/main.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/b43legacy/phy.c b/drivers/net/wireless/b43legacy/phy.c index 11319ec2d64a..aaf227203a98 100644 --- a/drivers/net/wireless/b43legacy/phy.c +++ b/drivers/net/wireless/b43legacy/phy.c @@ -31,6 +31,7 @@ #include #include +#include #include #include "b43legacy.h" diff --git a/drivers/net/wireless/hostap/hostap_info.c b/drivers/net/wireless/hostap/hostap_info.c index 6fa14a4e4b53..4dfb40a84c96 100644 --- a/drivers/net/wireless/hostap/hostap_info.c +++ b/drivers/net/wireless/hostap/hostap_info.c @@ -1,6 +1,7 @@ /* Host AP driver Info Frame processing (part of hostap.o module) */ #include +#include #include "hostap_wlan.h" #include "hostap.h" #include "hostap_ap.h" diff --git a/drivers/net/wireless/hostap/hostap_ioctl.c b/drivers/net/wireless/hostap/hostap_ioctl.c index 3f2bda881a4f..9419cebca8a5 100644 --- a/drivers/net/wireless/hostap/hostap_ioctl.c +++ b/drivers/net/wireless/hostap/hostap_ioctl.c @@ -1,6 +1,7 @@ /* ioctl() (mostly Linux Wireless Extensions) routines for Host AP driver */ #include +#include #include #include #include diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c index 8d58e6ed4e7d..827824d45de9 100644 --- a/drivers/net/wireless/ipw2x00/ipw2200.c +++ b/drivers/net/wireless/ipw2x00/ipw2200.c @@ -30,6 +30,7 @@ ******************************************************************************/ +#include #include "ipw2200.h" diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.c b/drivers/net/wireless/iwlwifi/iwl-3945.c index e70c5b0af364..68136172b823 100644 --- a/drivers/net/wireless/iwlwifi/iwl-3945.c +++ b/drivers/net/wireless/iwlwifi/iwl-3945.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-4965.c b/drivers/net/wireless/iwlwifi/iwl-4965.c index a22a0501c190..6f703a041847 100644 --- a/drivers/net/wireless/iwlwifi/iwl-4965.c +++ b/drivers/net/wireless/iwlwifi/iwl-4965.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c index eb08f4411000..d6bc0e051043 100644 --- a/drivers/net/wireless/iwlwifi/iwl-5000.c +++ b/drivers/net/wireless/iwlwifi/iwl-5000.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index cdc07c477457..313d3e5ee84b 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 484d5c1a7312..2dc928755454 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include "iwl-eeprom.h" diff --git a/drivers/net/wireless/iwlwifi/iwl-hcmd.c b/drivers/net/wireless/iwlwifi/iwl-hcmd.c index 532c8d6cd8da..a6856daf14cb 100644 --- a/drivers/net/wireless/iwlwifi/iwl-hcmd.c +++ b/drivers/net/wireless/iwlwifi/iwl-hcmd.c @@ -28,6 +28,7 @@ #include #include +#include #include #include "iwl-dev.h" /* FIXME: remove */ diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c index c18907544701..fb9bcfa6d947 100644 --- a/drivers/net/wireless/iwlwifi/iwl-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-tx.c @@ -28,6 +28,7 @@ *****************************************************************************/ #include +#include #include #include "iwl-eeprom.h" #include "iwl-dev.h" diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index c390dbd877e4..aa49230422f3 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwmc3200wifi/cfg80211.c b/drivers/net/wireless/iwmc3200wifi/cfg80211.c index a56a2b0ac99a..f3c55658225b 100644 --- a/drivers/net/wireless/iwmc3200wifi/cfg80211.c +++ b/drivers/net/wireless/iwmc3200wifi/cfg80211.c @@ -23,6 +23,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/wireless/iwmc3200wifi/commands.c b/drivers/net/wireless/iwmc3200wifi/commands.c index 23b52fa2605f..84158b6d35d8 100644 --- a/drivers/net/wireless/iwmc3200wifi/commands.c +++ b/drivers/net/wireless/iwmc3200wifi/commands.c @@ -40,6 +40,7 @@ #include #include #include +#include #include "iwm.h" #include "bus.h" diff --git a/drivers/net/wireless/iwmc3200wifi/main.c b/drivers/net/wireless/iwmc3200wifi/main.c index d668e4756324..222eb2cf1b30 100644 --- a/drivers/net/wireless/iwmc3200wifi/main.c +++ b/drivers/net/wireless/iwmc3200wifi/main.c @@ -38,6 +38,7 @@ #include #include +#include #include #include diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c index 40dbcbc16593..771a301003c9 100644 --- a/drivers/net/wireless/iwmc3200wifi/rx.c +++ b/drivers/net/wireless/iwmc3200wifi/rx.c @@ -38,6 +38,7 @@ #include #include +#include #include #include #include diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c index 685098148e10..0a324dcd264c 100644 --- a/drivers/net/wireless/libertas/cmd.c +++ b/drivers/net/wireless/libertas/cmd.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "host.h" #include "hostcmd.h" #include "decl.h" diff --git a/drivers/net/wireless/libertas/tx.c b/drivers/net/wireless/libertas/tx.c index 4c018f7a0a8d..8c3766a6e8e7 100644 --- a/drivers/net/wireless/libertas/tx.c +++ b/drivers/net/wireless/libertas/tx.c @@ -3,6 +3,7 @@ */ #include #include +#include #include "hostcmd.h" #include "radiotap.h" diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c index 4c97c6ad6f5d..bc08464d8323 100644 --- a/drivers/net/wireless/prism54/isl_ioctl.c +++ b/drivers/net/wireless/prism54/isl_ioctl.c @@ -19,6 +19,7 @@ * */ +#include #include #include #include diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c index e26d7b3ceab5..2505be56ae39 100644 --- a/drivers/net/wireless/prism54/islpci_dev.c +++ b/drivers/net/wireless/prism54/islpci_dev.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/wireless/prism54/islpci_mgt.c b/drivers/net/wireless/prism54/islpci_mgt.c index f7c677e2094d..69d2f882fd06 100644 --- a/drivers/net/wireless/prism54/islpci_mgt.c +++ b/drivers/net/wireless/prism54/islpci_mgt.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/wireless/rt2x00/rt2x00debug.c b/drivers/net/wireless/rt2x00/rt2x00debug.c index 7b3ee8c2eaef..68bc9bb1dbf9 100644 --- a/drivers/net/wireless/rt2x00/rt2x00debug.c +++ b/drivers/net/wireless/rt2x00/rt2x00debug.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "rt2x00.h" diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c index 2ce8f9ccc66e..d49ecc94bd49 100644 --- a/drivers/pci/pcie/aer/aerdrv.c +++ b/drivers/pci/pcie/aer/aerdrv.c @@ -17,6 +17,7 @@ #include #include +#include #include #include #include diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c index 4cdb31a362ca..a0c816238aa9 100644 --- a/drivers/rtc/interface.c +++ b/drivers/rtc/interface.c @@ -12,6 +12,7 @@ */ #include +#include #include int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm) diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c index 8a11de9552cd..62227cd52410 100644 --- a/drivers/rtc/rtc-dev.c +++ b/drivers/rtc/rtc-dev.c @@ -13,6 +13,7 @@ #include #include +#include #include "rtc-core.h" static dev_t rtc_devt; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index a9d707047202..e941367dd28f 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c index 1d9a6f54658e..01950c62dc8d 100644 --- a/drivers/uwb/whc-rc.c +++ b/drivers/uwb/whc-rc.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/file.c b/fs/file.c index f313314f996f..87e129030ab1 100644 --- a/fs/file.c +++ b/fs/file.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index b78cf8194957..7ca72b74eec7 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -610,6 +609,7 @@ extern void debug_poll_all_shared_irqs(void); static inline void debug_poll_all_shared_irqs(void) { } #endif +struct seq_file; int show_interrupts(struct seq_file *p, void *v); struct irq_desc; diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 81bb42358595..eaf36364b7d4 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -11,6 +11,7 @@ #define LINUX_MMC_HOST_H #include +#include #include diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a81cf80554db..17c71bb565c6 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -11,6 +11,7 @@ */ #include +#include #include #include #include diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index 50d022e5a560..ec815a960b5d 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index fb0f46fa1ecd..c3a4e2907eaa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 2755a3bd16a1..eae56fddfa3b 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include diff --git a/lib/fault-inject.c b/lib/fault-inject.c index f97af55bdd96..7e65af70635e 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5e7aed0802bf..0f551a4a44cd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c index eafc010907c2..3c1754023022 100644 --- a/net/irda/ircomm/ircomm_tty_attach.c +++ b/net/irda/ircomm/ircomm_tty_attach.c @@ -30,6 +30,7 @@ ********************************************************************/ #include +#include #include #include diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c index 62116829b817..315ead3cb926 100644 --- a/net/irda/irlan/irlan_common.c +++ b/net/irda/irlan/irlan_common.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 7b6b631f647f..d340110f5c0c 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c index cf9a4b531a98..cccc2e93234f 100644 --- a/net/irda/irnet/irnet_irda.c +++ b/net/irda/irnet/irnet_irda.c @@ -9,6 +9,7 @@ */ #include "irnet_irda.h" /* Private header */ +#include #include #include diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 68cbcb19cbd8..7dea882dbb75 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -13,6 +13,7 @@ * 2) as a control channel (write commands, read events) */ +#include #include #include "irnet_ppp.h" /* Private header */ /* Please put other headers in irnet.h - Thanks */ diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c index a59043fbb0ff..45667054a5f3 100644 --- a/net/mac80211/rc80211_pid_debugfs.c +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -6,6 +6,7 @@ * published by the Free Software Foundation. */ +#include #include #include #include diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 7c9ec3dee96e..ca6e68dcd8a8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index 0cf5e8c27a10..3fa5751af0ec 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/net/wireless/core.c b/net/wireless/core.c index 45b2be3274db..a595f712b5bf 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include "nl80211.h" -- cgit v1.2.3 From 15b812f1d0a5ca8f5efe7f5882f468af10682ca8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 11 Oct 2009 14:17:16 -0700 Subject: pci: increase alignment to make more space for hidden code As reported in http://bugzilla.kernel.org/show_bug.cgi?id=13940 on some system when acpi are enabled, acpi clears some BAR for some devices without reason, and kernel will need to allocate devices for them. It then apparently hits some undocumented resource conflict, resulting in non-working devices. Try to increase alignment to get more safe range for unassigned devices. Signed-off-by: Yinghai Lu Signed-off-by: Linus Torvalds --- arch/x86/kernel/e820.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 85419bb7d4ab..d17d482a04f4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1378,8 +1378,8 @@ static unsigned long ram_alignment(resource_size_t pos) if (mb < 16) return 1024*1024; - /* To 32MB for anything above that */ - return 32*1024*1024; + /* To 64MB for anything above that */ + return 64*1024*1024; } #define MAX_RESOURCE_SIZE ((resource_size_t)-1) -- cgit v1.2.3 From 9a821b231644028f8e2a853eb33d1184e925b183 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 12 Oct 2009 12:59:29 +0100 Subject: x86: Move pci_iommu_init to rootfs_initcall() We want this to happen after the PCI quirks, which are now running at the very end of the fs_initcalls. This works around the BIOS problems which were originally addressed by commit db8be50c4307dac2b37305fc59c8dc0f978d09ea ('USB: Work around BIOS bugs by quiescing USB controllers earlier'), which was reverted in commit d93a8f829fe1d2f3002f2c6ddb553d12db420412. Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 64b838eac18c..e0d9199d0eb9 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -311,7 +311,7 @@ void pci_iommu_shutdown(void) amd_iommu_shutdown(); } /* Must execute after PCI subsystem */ -fs_initcall(pci_iommu_init); +rootfs_initcall(pci_iommu_init); #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. Disable it here */ -- cgit v1.2.3 From 7a4b7e5e741fe0a72a517b0367a2659aa53f7c44 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 6 Oct 2009 16:32:43 +0100 Subject: x86: Fix Suspend to RAM freeze on Acer Aspire 1511Lmi laptop Move the trampoline and accessors back out of .cpuinit.* for the case of 64-bits+ACPI_SLEEP. This solves s2ram hangs reported in: http://bugzilla.kernel.org/show_bug.cgi?id=14279 Reported-and-bisected-by: Christian Casteyde Signed-off-by: Jan Beulich Cc: Cc: "Andrew Morton" Cc: "Rafael J. Wysocki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/trampoline.c | 12 ++++++++++-- arch/x86/kernel/trampoline_64.S | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index 699f7eeb896a..cd022121cab6 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -3,8 +3,16 @@ #include #include +#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) +#define __trampinit +#define __trampinitdata +#else +#define __trampinit __cpuinit +#define __trampinitdata __cpuinitdata +#endif + /* ready for x86_64 and x86 */ -unsigned char *__cpuinitdata trampoline_base = __va(TRAMPOLINE_BASE); +unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); void __init reserve_trampoline_memory(void) { @@ -26,7 +34,7 @@ void __init reserve_trampoline_memory(void) * bootstrap into the page concerned. The caller * has made sure it's suitably aligned. */ -unsigned long __cpuinit setup_trampoline(void) +unsigned long __trampinit setup_trampoline(void) { memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); return virt_to_phys(trampoline_base); diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S index 596d54c660a5..3af2dff58b21 100644 --- a/arch/x86/kernel/trampoline_64.S +++ b/arch/x86/kernel/trampoline_64.S @@ -32,8 +32,12 @@ #include #include +#ifdef CONFIG_ACPI_SLEEP +.section .rodata, "a", @progbits +#else /* We can free up the trampoline after bootup if cpu hotplug is not supported. */ __CPUINITRODATA +#endif .code16 ENTRY(trampoline_data) -- cgit v1.2.3 From d1705c558c95418378b11a0be963fe1b3e2fa381 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 12 Oct 2009 11:32:31 -0700 Subject: x86: fix kernel panic on 32 bits when profiling Latest kernel has a kernel panic in booting on i386 machine when profile=2 setting in cmdline. It is due to 'sp' being incorrect in profile_pc(). BUG: unable to handle kernel NULL pointer dereference at 00000246 IP: [] profile_pc+0x2a/0x48 *pde = 00000000 Oops: 0000 [#1] SMP This differs from the original version by Alex Shi in that we use the kernel_stack_pointer() inline already defined in for this purpose, instead of #ifdef. Originally-by: Alex Shi Cc: "Chen, Tim C" Cc: "Rafael J. Wysocki" Cc: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/kernel/time.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index dcb00d278512..be2573448ed9 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -38,7 +38,8 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else - unsigned long *sp = (unsigned long *)regs->sp; + unsigned long *sp = + (unsigned long *)kernel_stack_pointer(regs); /* * Return address is either directly at stack pointer * or above a saved flags. Eflags has bits 22-31 zero, -- cgit v1.2.3 From 89ccf465abe6b20d804a63ae20307970c441369d Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 14 Oct 2009 18:50:39 +0800 Subject: x86, perf_event: Rename 'performance counter interrupt' In 'cdd6c482c9ff9c55475ee7392ec8f672eddb7be6', we renamed Performance Counters -> Performance Events. The name showed up in /proc/interrupts also needs a change. I use PMI (Performance monitoring interrupt) here, since it is the official name used in Intel's documents. Signed-off-by: Li Hong Cc: Andrew Morton Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20091014105039.GA22670@uhli> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 391206199515..86fb2c8e065a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,10 +63,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "%*s: ", prec, "CNT"); + seq_printf(p, "%*s: ", prec, "PMI"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); - seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, " Performance monitoring interrupts\n"); seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); -- cgit v1.2.3 From e9a63a4e559fbdc522072281d05e6b13c1022f4b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 14 Oct 2009 14:16:38 -0700 Subject: x86: linker script syntax nits The linker scripts grew some use of weirdly wrong linker script syntax. It happens to work, but it's not what the syntax is documented to be. Clean it up to use the official syntax. Signed-off-by: Roland McGrath CC: Ian Lance Taylor --- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 17 ++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cda..0e50e1e5c573 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -56,6 +56,6 @@ SECTIONS /DISCARD/ : { *(.note*) } - - . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } + +ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..8d6001ad8d8d 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,8 +305,8 @@ SECTIONS #ifdef CONFIG_X86_32 -. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -319,12 +319,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -. = ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -332,7 +332,6 @@ INIT_PER_CPU(irq_stack_union); #ifdef CONFIG_KEXEC #include -. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif - -- cgit v1.2.3 From db8590f5043f3436a65b24155a3a7af2604df876 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 15 Oct 2009 08:08:12 +0200 Subject: Revert "x86: linker script syntax nits" This reverts commit e9a63a4e559fbdc522072281d05e6b13c1022f4b. This breaks older binutils, where sink-less asserts are broken. See this commit for further details: d2ba8b2: x86: Fix assert syntax in vmlinux.lds.S Acked-by: "H. Peter Anvin" Acked-by: Sam Ravnborg Cc: Linus Torvalds LKML-Reference: <4AD6523D.5030909@zytor.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/realmode/wakeup.lds.S | 4 ++-- arch/x86/kernel/vmlinux.lds.S | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 0e50e1e5c573..7da00b799cda 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -56,6 +56,6 @@ SECTIONS /DISCARD/ : { *(.note*) } -} -ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); + . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); +} diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8d6001ad8d8d..92929fb3f9fa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,8 +305,8 @@ SECTIONS #ifdef CONFIG_X86_32 -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +. = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #else /* * Per-cpu symbols which need to be offset from __per_cpu_load @@ -319,12 +319,12 @@ INIT_PER_CPU(irq_stack_union); /* * Build-time check on the image size: */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE"); +. = ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE"); #ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); +. = ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); #endif #endif /* CONFIG_X86_32 */ @@ -332,6 +332,7 @@ ASSERT((per_cpu__irq_stack_union == 0), #ifdef CONFIG_KEXEC #include -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big"); +. = ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big"); #endif + -- cgit v1.2.3 From a5912f6b3e20c137172460e6d4dd180866c00963 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 16 Oct 2009 07:18:46 +0200 Subject: x86: Document linker script ASSERT() quirk Older binutils breaks if ASSERT() is used without a sink for the output. For example 2.14.90.0.6 is known to be broken, the link fails with: LD .tmp_vmlinux1 ld:arch/x86/kernel/vmlinux.lds:678: parse error Document this quirk in all three files that use it. See: http://marc.info/?l=linux-kbuild&m=124930110427870&w=2 See[2]: d2ba8b2 ("x86: Fix assert syntax in vmlinux.lds.S") Cc: Linus Torvalds Cc: Roland McGrath Cc: "H. Peter Anvin" Cc: Sam Ravnborg LKML-Reference: <4AD6523D.5030909@zytor.com> Signed-off-by: Ingo Molnar --- arch/x86/boot/setup.ld | 3 +++ arch/x86/kernel/acpi/realmode/wakeup.lds.S | 3 +++ arch/x86/kernel/vmlinux.lds.S | 3 +++ 3 files changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index 0f6ec455a2b1..03c0683636b6 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -53,6 +53,9 @@ SECTIONS /DISCARD/ : { *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); /* Necessary for the very-old-loader check to work... */ diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S index 7da00b799cda..060fff8f5c5b 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S @@ -57,5 +57,8 @@ SECTIONS *(.note*) } + /* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!"); } diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 92929fb3f9fa..3c68fe2d46cf 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -305,6 +305,9 @@ SECTIONS #ifdef CONFIG_X86_32 +/* + * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility: + */ . = ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), "kernel image bigger than KERNEL_IMAGE_SIZE"); #else -- cgit v1.2.3 From 036ed8ba61b72c19dc5759446d4fe0844aa88255 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Thu, 15 Oct 2009 17:40:00 -0500 Subject: x86, UV: Fix information in __uv_hub_info structure A few parts of the uv_hub_info structure are initialized incorrectly. - n_val is being loaded with m_val. - gpa_mask is initialized with a bytes instead of an unsigned long. - Handle the case where none of the alias registers are used. Lastly I converted the bau over to using the uv_hub_info->m_val which is the correct value. Without this patch, booting a large configuration hits a problem where the upper bits of the gnode affect the pnode and the bau will not operate. Signed-off-by: Robin Holt Acked-by: Jack Steiner Cc: Cliff Whickman Cc: stable@kernel.org LKML-Reference: <20091015224946.396355000@alcatraz.americas.sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++---- arch/x86/kernel/tlb_uv.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index f5f5886a6b53..326c25477d3d 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -352,14 +352,14 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) for (i = 0; i < ARRAY_SIZE(redir_addrs); i++) { alias.v = uv_read_local_mmr(redir_addrs[i].alias); - if (alias.s.base == 0) { + if (alias.s.enable && alias.s.base == 0) { *size = (1UL << alias.s.m_alias); redirect.v = uv_read_local_mmr(redir_addrs[i].redirect); *base = (unsigned long)redirect.s.dest_base << DEST_SHIFT; return; } } - BUG(); + *base = *size = 0; } enum map_type {map_wb, map_uc}; @@ -619,12 +619,12 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->lowmem_remap_base = lowmem_redir_base; uv_cpu_hub_info(cpu)->lowmem_remap_top = lowmem_redir_size; uv_cpu_hub_info(cpu)->m_val = m_val; - uv_cpu_hub_info(cpu)->n_val = m_val; + uv_cpu_hub_info(cpu)->n_val = n_val; uv_cpu_hub_info(cpu)->numa_blade_id = blade; uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; uv_cpu_hub_info(cpu)->pnode = pnode; uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; - uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; + uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 503c1f2e8835..f99fb6acfe34 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -843,8 +843,8 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->n_val; - uv_mmask = (1UL << uv_hub_info->n_val) - 1; + uv_nshift = uv_hub_info->m_val; + uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); uv_bau_table_bases = (struct bau_control **) -- cgit v1.2.3 From 93ae5012a79b11e7fc855b52c7ce1e16fe1540b0 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 15 Oct 2009 14:21:14 -0700 Subject: x86: Don't print number of MCE banks for every CPU The MCE initialization code explicitly says it doesn't handle asymmetric configurations where different CPUs support different numbers of MCE banks, and it prints a big warning in that case. Therefore, printing the "mce: CPU supports MCE banks" message into the kernel log for every CPU is pure redundancy that clutters the log significantly for systems with lots of CPUs. Signed-off-by: Roland Dreier LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index b1598a9436d0..721a77ca8115 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1214,7 +1214,8 @@ static int __cpuinit mce_cap_init(void) rdmsrl(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; - printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); + if (!banks) + printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b); if (b > MAX_NR_BANKS) { printk(KERN_WARNING -- cgit v1.2.3 From 1d21e6e3ffad2939f9d8179817c6f9bc3b811b68 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Fri, 16 Oct 2009 06:29:20 -0500 Subject: x86, UV: Fix and clean up bau code to use uv_gpa_to_pnode() Create an inline function to extract the pnode from a global physical address and then convert the broadcast assist unit to use the newly created uv_gpa_to_pnode function. The open-coded code was wrong as well - it might explain a few of our unexplained bau hangs. Signed-off-by: Robin Holt Acked-by: Cliff Whickman Cc: linux-mm@kvack.org Cc: Jack Steiner LKML-Reference: <20091016112920.GZ8903@sgi.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uv/uv_hub.h | 16 +++++++++++++++- arch/x86/kernel/tlb_uv.c | 7 ++----- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index 04eb6c958b9d..94908a08020a 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -114,7 +114,7 @@ /* * The largest possible NASID of a C or M brick (+ 2) */ -#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2) +#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2) struct uv_scir_s { struct timer_list timer; @@ -230,6 +230,20 @@ static inline unsigned long uv_gpa(void *v) return uv_soc_phys_ram_to_gpa(__pa(v)); } +/* gnode -> pnode */ +static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) +{ + return gpa >> uv_hub_info->m_val; +} + +/* gpa -> pnode */ +static inline int uv_gpa_to_pnode(unsigned long gpa) +{ + unsigned long n_mask = (1UL << uv_hub_info->n_val) - 1; + + return uv_gpa_to_gnode(gpa) & n_mask; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index f99fb6acfe34..1740c85e24bb 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,8 +23,6 @@ static struct bau_control **uv_bau_table_bases __read_mostly; static int uv_bau_retry_limit __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; /* base pnode in this partition */ static int uv_partition_base_pnode __read_mostly; @@ -723,7 +721,7 @@ uv_activation_descriptor_init(int node, int pnode) BUG_ON(!adp); pa = uv_gpa(adp); /* need the real nasid*/ - n = pa >> uv_nshift; + n = uv_gpa_to_pnode(pa); m = pa & uv_mmask; uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, @@ -778,7 +776,7 @@ uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) * need the pnode of where the memory was really allocated */ pa = uv_gpa(pqp); - pn = pa >> uv_nshift; + pn = uv_gpa_to_pnode(pa); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | @@ -843,7 +841,6 @@ static int __init uv_bau_init(void) GFP_KERNEL, cpu_to_node(cur_cpu)); uv_bau_retry_limit = 1; - uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nblades = uv_num_possible_blades(); -- cgit v1.2.3 From 72ed7de74e8f0fad0d8e567ae1f987b740accb3f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Mon, 26 Oct 2009 11:11:43 +0100 Subject: x86: crash_dump: Fix non-pae kdump kernel memory accesses Non-PAE 32-bit dump kernels may wrap an address around 4G and poke unwanted space. ptes there are 32-bit long, and since pfn << PAGE_SIZE may exceed this limit, high pfn bits are cropped and wrong address mapped by kmap_atomic_pfn in copy_oldmem_page. Don't allow this behavior in non-PAE kdump kernels by checking pfns passed into copy_oldmem_page. In the case of failure, userspace process gets EFAULT. [v2] - fix comments - move ifdefs inside the function Signed-off-by: Jiri Slaby Cc: Vivek Goyal Cc: Eric W. Biederman Cc: Simon Horman Cc: Paul Mundt LKML-Reference: <1256551903-30567-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash_dump_32.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index f7cdb3b457aa..cd97ce18c29d 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -16,6 +16,22 @@ static void *kdump_buf_page; /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; +static inline bool is_crashed_pfn_valid(unsigned long pfn) +{ +#ifndef CONFIG_X86_PAE + /* + * non-PAE kdump kernel executed from a PAE one will crop high pte + * bits and poke unwanted space counting again from address 0, we + * don't want that. pte must fit into unsigned long. In fact the + * test checks high 12 bits for being zero (pfn will be shifted left + * by PAGE_SHIFT). + */ + return pte_pfn(pfn_pte(pfn, __pgprot(0))) == pfn; +#else + return true; +#endif +} + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -41,6 +57,9 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!csize) return 0; + if (!is_crashed_pfn_valid(pfn)) + return -EFAULT; + vaddr = kmap_atomic_pfn(pfn, KM_PTE0); if (!userbuf) { -- cgit v1.2.3 From 772be899bc022ef2b911c3611b487d417e3269c3 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Tue, 20 Oct 2009 12:54:02 +0800 Subject: x86: Make EFI RTC function depend on 32bit again The EFI RTC functions are only available on 32 bit. commit 7bd867df (x86: Move get/set_wallclock to x86_platform_ops) removed the 32bit dependency which leads to boot crashes on 64bit EFI systems. Add the dependency back. Solves: http://bugzilla.kernel.org/show_bug.cgi?id=14466 Tested-by: Matthew Garrett Signed-off-by: Feng Tang LKML-Reference: <20091020125402.028d66d5@feng-desktop> Signed-off-by: Thomas Gleixner --- arch/x86/kernel/efi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index ad5bd988fb79..cdcfb122f256 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -454,8 +454,10 @@ void __init efi_init(void) if (add_efi_memmap) do_add_efi_memmap(); +#ifdef CONFIG_X86_32 x86_platform.get_wallclock = efi_get_time; x86_platform.set_wallclock = efi_set_rtc_mmss; +#endif /* Setup for EFI runtime service */ reboot_type = BOOT_EFI; -- cgit v1.2.3 From ca0207114f1708b563f510b7781a360ec5b98359 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 28 Oct 2009 18:02:26 +0100 Subject: x86/amd-iommu: Un__init function required on shutdown The function iommu_feature_disable is required on system shutdown to disable the IOMMU but it is marked as __init. This may result in a panic if the memory is reused. This patch fixes this bug. Signed-off-by: Joerg Roedel --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 498c8c79aaa5..1e423b2add15 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -240,7 +240,7 @@ static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) +static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; -- cgit v1.2.3 From 16121d70fdf9eeb05ead46b241a293156323dbbe Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Sun, 1 Nov 2009 19:27:05 -0500 Subject: x86: Fix printk message typo in mtrr cleanup code Trivial typo. Signed-off-by: Dave Jones LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/cleanup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 315738c74aad..73c86db5acbe 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -846,7 +846,7 @@ int __init mtrr_cleanup(unsigned address_bits) sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); range_sums = sum_ranges(range, nr_range); - printk(KERN_INFO "total RAM coverred: %ldM\n", + printk(KERN_INFO "total RAM covered: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); if (mtrr_chunk_size && mtrr_gran_size) { -- cgit v1.2.3 From 05154752cf3767c544b65b5e340793d40b3f1229 Mon Sep 17 00:00:00 2001 From: Gottfried Haider Date: Mon, 2 Nov 2009 11:51:11 +0100 Subject: x86: Add reboot quirk for 3 series Mac mini MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reboot does not work out of the box on my "Early 2009" Mac mini (3,1). Detect this machine via DMI as we do for recent MacBooks. Signed-off-by: Gottfried Haider Cc: Ozan Çağlayan Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index a1a3cdda06e1..f93078746e00 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -436,6 +436,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "MacBookPro5"), }, }, + { /* Handle problems with rebooting on Apple Macmini3,1 */ + .callback = set_pci_reboot, + .ident = "Apple Macmini3,1", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), + }, + }, { } }; -- cgit v1.2.3 From 89240ba059ca468ae7a8346edf7f95082458c2fc Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Tue, 3 Nov 2009 10:22:40 +0100 Subject: x86, fs: Fix x86 procfs stack information for threads on 64-bit This patch fixes two issues in the procfs stack information on x86-64 linux. The 32 bit loader compat_do_execve did not store stack start. (this was figured out by Alexey Dobriyan). The stack information on a x64_64 kernel always shows 0 kbyte stack usage, because of a missing implementation of the KSTK_ESP macro which always returned -1. The new implementation now returns the right value. Signed-off-by: Stefani Seibold Cc: Americo Wang Cc: Alexey Dobriyan Cc: Al Viro Cc: Andrew Morton LKML-Reference: <1257240160.4889.24.camel@wall-e> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/process_64.c | 5 +++++ fs/compat.c | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c3429e8b2424..c9786480f0fe 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -1000,7 +1000,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8)) #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) -#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */ +extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad535b683170..eb62cbcaa490 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -664,3 +664,8 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } +unsigned long KSTK_ESP(struct task_struct *task) +{ + return (test_tsk_thread_flag(task, TIF_IA32)) ? + (task_pt_regs(task)->sp) : ((task)->thread.usersp); +} diff --git a/fs/compat.c b/fs/compat.c index d576b552e8e2..6c19040ffeef 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1532,6 +1532,8 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; + current->stack_start = current->mm->start_stack; + /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; -- cgit v1.2.3 From f1b291d4c47440cbfc1a478e88800e2742d60a80 Mon Sep 17 00:00:00 2001 From: Simon Kagstrom Date: Fri, 6 Nov 2009 15:44:04 +0100 Subject: x86: Add Phoenix/MSC BIOSes to lowmem corruption list We have a board with a Phoenix/MSC BIOS which also corrupts the low 64KB of RAM, so add an entry to the table. Signed-off-by: Simon Kagstrom LKML-Reference: <20091106154404.002648d9@marrow.netinsight.se> Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e09f0e2c14b5..2a34f9c5be21 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -659,6 +659,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix/MSC BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), + }, + }, { /* * AMI BIOS with low memory corruption was found on Intel DG45ID board. -- cgit v1.2.3 From eb647138acefc897c0eb6eddd5d3650966dfe627 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Sun, 8 Nov 2009 12:12:14 +0100 Subject: x86/PCI: Adjust GFP mask handling for coherent allocations Rather than forcing GFP flags and DMA mask to be inconsistent, GFP flags should be determined even for the fallback device through dma_alloc_coherent_mask()/dma_alloc_coherent_gfp_flags(). This restores 64-bit behavior as it was prior to commits 8965eb19386fdf5ccd0ef8b02593eb8560aa3416 and 4a367f3a9dbf2e7ffcee4702203479809236ee6e (not sure why there are two of them), where GFP_DMA was forced on for 32-bit, but not for 64-bit, with the slight adjustment that afaict even 32-bit doesn't need this without CONFIG_ISA. Signed-off-by: Jan Beulich Acked-by: Takashi Iwai LKML-Reference: <4AF18187020000780001D8AA@vpn.id2.novell.com> Signed-off-by: Ingo Molnar Signed-off-by: Jesse Barnes --- arch/x86/include/asm/dma-mapping.h | 10 +++++++--- arch/x86/kernel/pci-dma.c | 6 ++---- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 0ee770d23d0e..6a25d5d42836 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -14,6 +14,12 @@ #include #include +#ifdef CONFIG_ISA +# define ISA_DMA_BIT_MASK DMA_BIT_MASK(24) +#else +# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) +#endif + extern dma_addr_t bad_dma_address; extern int iommu_merge; extern struct device x86_dma_fallback_dev; @@ -124,10 +130,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; - if (!dev) { + if (!dev) dev = &x86_dma_fallback_dev; - gfp |= GFP_DMA; - } if (!is_device_dma_capable(dev)) return NULL; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index b2a71dca5642..a6e804d16c35 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -45,12 +45,10 @@ int iommu_pass_through __read_mostly; dma_addr_t bad_dma_address __read_mostly = 0; EXPORT_SYMBOL(bad_dma_address); -/* Dummy device used for NULL arguments (normally ISA). Better would - be probably a smaller DMA mask, but this is bug-to-bug compatible - to older i386. */ +/* Dummy device used for NULL arguments (normally ISA). */ struct device x86_dma_fallback_dev = { .init_name = "fallback device", - .coherent_dma_mask = DMA_BIT_MASK(32), + .coherent_dma_mask = ISA_DMA_BIT_MASK, .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; EXPORT_SYMBOL(x86_dma_fallback_dev); -- cgit v1.2.3 From 506f90eeae682dc96c11c7aefac0262b3a560b49 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 29 Oct 2009 14:45:52 +0100 Subject: x86, amd-ucode: Check UCODE_MAGIC before loading the container file Signed-off-by: Borislav Petkov Signed-off-by: Andreas Herrmann LKML-Reference: <20091029134552.GC30802@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 366baa179913..f4c538b681ca 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -317,6 +317,12 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device) return UCODE_NFOUND; } + if (*(u32 *)firmware->data != UCODE_MAGIC) { + printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", + *(u32 *)firmware->data); + return UCODE_ERROR; + } + ret = generic_load_microcode(cpu, firmware->data, firmware->size); release_firmware(firmware); -- cgit v1.2.3 From f7f3cad06080f14f60b1453af94463ff68ea2739 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Sat, 24 Oct 2009 17:25:38 +0200 Subject: [CPUFREQ] longhaul: select Longhaul version 2 for capable CPUs There is a typo in the longhaul detection code so only Longhaul v1 or Longhaul v3 is selected. The Longhaul v2 is not selected even for CPUs which are capable of. Tested on PCChips Giga Pro board. Frequency changes work and the Longhaul v2 detects that the board is not capable of changing CPU voltage. Signed-off-by: Krzysztof Helt Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index ce2ed3e4aad9..cabd2fa3fc93 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -813,7 +813,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy) memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr)); break; case 1 ... 15: - longhaul_version = TYPE_LONGHAUL_V1; + longhaul_version = TYPE_LONGHAUL_V2; if (c->x86_mask < 8) { cpu_model = CPU_SAMUEL2; cpuname = "C3 'Samuel 2' [C5B]"; -- cgit v1.2.3 From c53614ec17fe6296a696aa4ac71a799814bb50c1 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 6 Oct 2009 17:36:53 +0200 Subject: [CPUFREQ] powernow-k8: Fix test in get_transition_latency() Not makes it a bool before the comparison. Signed-off-by: Roel Kluin Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 6394aa5c7985..3f12dabeab52 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1022,7 +1022,7 @@ static int get_transition_latency(struct powernow_k8_data *data) * set it to 1 to avoid problems in the future. * For all others it's a BIOS bug. */ - if (!boot_cpu_data.x86 == 0x11) + if (boot_cpu_data.x86 != 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; -- cgit v1.2.3 From 293afe44d75abce4252db76cbb303a7de4297ce1 Mon Sep 17 00:00:00 2001 From: John Villalovos Date: Fri, 25 Sep 2009 13:30:08 -0400 Subject: [CPUFREQ] acpi-cpufreq: blacklist Intel 0f68: Fix HT detection and put in notification message Removing the SMT/HT check, since the Errata doesn't mention Hyper-Threading. Adding in a printk, so that the user knows why acpi-cpufreq refuses to load. Also, once system is blacklisted, don't repeat checks to see if blacklisted. This also causes the message to only be printed once, rather than for each CPU. Signed-off-by: John L. Villalovos Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 7d5c3b0ea8da..8b581d3905cb 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -526,15 +526,21 @@ static const struct dmi_system_id sw_any_bug_dmi_table[] = { static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c) { - /* http://www.intel.com/Assets/PDF/specupdate/314554.pdf + /* Intel Xeon Processor 7100 Series Specification Update + * http://www.intel.com/Assets/PDF/specupdate/314554.pdf * AL30: A Machine Check Exception (MCE) Occurring during an * Enhanced Intel SpeedStep Technology Ratio Change May Cause - * Both Processor Cores to Lock Up when HT is enabled*/ + * Both Processor Cores to Lock Up. */ if (c->x86_vendor == X86_VENDOR_INTEL) { if ((c->x86 == 15) && (c->x86_model == 6) && - (c->x86_mask == 8) && smt_capable()) + (c->x86_mask == 8)) { + printk(KERN_INFO "acpi-cpufreq: Intel(R) " + "Xeon(R) 7100 Errata AL30, processors may " + "lock up on frequency changes: disabling " + "acpi-cpufreq.\n"); return -ENODEV; + } } return 0; } @@ -549,13 +555,18 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) unsigned int result = 0; struct cpuinfo_x86 *c = &cpu_data(policy->cpu); struct acpi_processor_performance *perf; +#ifdef CONFIG_SMP + static int blacklisted; +#endif dprintk("acpi_cpufreq_cpu_init\n"); #ifdef CONFIG_SMP - result = acpi_cpufreq_blacklist(c); - if (result) - return result; + if (blacklisted) + return blacklisted; + blacklisted = acpi_cpufreq_blacklist(c); + if (blacklisted) + return blacklisted; #endif data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL); -- cgit v1.2.3 From 8dca15e40889e5d5e9655b03ba79c26200f760ce Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 2 Nov 2009 23:35:30 -0800 Subject: [CPUFREQ] speedstep-ich: fix error caused by 394122ab144dae4b276d74644a2f11c44a60ac5c "[CPUFREQ] cpumask: avoid playing with cpus_allowed in speedstep-ich.c" changed the code to mistakenly pass the current cpu as the "processor" argument of speedstep_get_frequency(), whereas it should be the type of the processor. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=14340 Based on a patch by Dave Mueller. Signed-off-by: Rusty Russell Acked-by: Dominik Brodowski Reported-by: Dave Mueller Cc: Signed-off-by: Andrew Morton Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 6911e91fb4f6..3ae5a7a3a500 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -232,28 +232,23 @@ static unsigned int speedstep_detect_chipset(void) return 0; } -struct get_freq_data { - unsigned int speed; - unsigned int processor; -}; - -static void get_freq_data(void *_data) +static void get_freq_data(void *_speed) { - struct get_freq_data *data = _data; + unsigned int *speed = _speed; - data->speed = speedstep_get_frequency(data->processor); + *speed = speedstep_get_frequency(speedstep_processor); } static unsigned int speedstep_get(unsigned int cpu) { - struct get_freq_data data = { .processor = cpu }; + unsigned int speed; /* You're supposed to ensure CPU is online. */ - if (smp_call_function_single(cpu, get_freq_data, &data, 1) != 0) + if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0) BUG(); - dprintk("detected %u kHz as current frequency\n", data.speed); - return data.speed; + dprintk("detected %u kHz as current frequency\n", speed); + return speed; } /** -- cgit v1.2.3 From d77b81974521c82fa6fda38dfff1b491dcc62a32 Mon Sep 17 00:00:00 2001 From: Harald Welte Date: Tue, 24 Nov 2009 16:53:00 +0100 Subject: [CPUFREQ] Enable ACPI PDC handshake for VIA/Centaur CPUs In commit 0de51088e6a82bc8413d3ca9e28bbca2788b5b53, we introduced the use of acpi-cpufreq on VIA/Centaur CPU's by removing a vendor check for VENDOR_INTEL. However, as it turns out, at least the Nano CPU's also need the PDC (processor driver capabilities) handshake in order to activate the methods required for acpi-cpufreq. Since arch_acpi_processor_init_pdc() contains another vendor check for Intel, the PDC is not initialized on VIA CPU's. The resulting behavior of a current mainline kernel on such systems is: acpi-cpufreq loads and it indicates CPU frequency changes. However, the CPU stays at a single frequency This trivial patch ensures that init_intel_pdc() is called on Intel and VIA/Centaur CPU's alike. Signed-off-by: Harald Welte Signed-off-by: Dave Jones --- arch/x86/kernel/acpi/processor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c index d296f4a195c9..d85d1b2432ba 100644 --- a/arch/x86/kernel/acpi/processor.c +++ b/arch/x86/kernel/acpi/processor.c @@ -79,7 +79,8 @@ void arch_acpi_processor_init_pdc(struct acpi_processor *pr) struct cpuinfo_x86 *c = &cpu_data(pr->id); pr->pdc = NULL; - if (c->x86_vendor == X86_VENDOR_INTEL) + if (c->x86_vendor == X86_VENDOR_INTEL || + c->x86_vendor == X86_VENDOR_CENTAUR) init_intel_pdc(pr, c); return; -- cgit v1.2.3