From 405d967dc70002991f8fc35c20e0d3cbc7614f63 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 24 Jun 2009 15:13:38 +0900 Subject: linker script: throw away .discard section x86 throws away .discard section but no other archs do. Also, .discard is not thrown away while linking modules. Make every arch and module linking throw it away. This will be used to define dummy variables for percpu declarations and definitions. This patch is based on Ivan Kokshaysky's alpha percpu patch. [ Impact: always throw away everything in .discard ] Signed-off-by: Tejun Heo Cc: Ivan Kokshaysky Cc: Richard Henderson Cc: Russell King Cc: Haavard Skinnemoen Cc: Bryan Wu Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Yoshinori Sato Cc: Tony Luck Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Kyle McMartin Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Jeff Dike Cc: Chris Zankel Cc: Rusty Russell Cc: Ingo Molnar --- arch/arm/kernel/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 6c0779792546..e256c57b8981 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -82,6 +82,7 @@ SECTIONS EXIT_TEXT EXIT_DATA *(.exitcall.exit) + *(.discard) *(.ARM.exidx.exit.text) *(.ARM.extab.exit.text) #ifndef CONFIG_MMU -- cgit v1.2.3 From 02239f0a4264608686cc0015d906c7b2dead89df Mon Sep 17 00:00:00 2001 From: Hartley Sweeten Date: Wed, 8 Jul 2009 02:00:49 +0100 Subject: [ARM] 5577/2: ep93xx: syscon locked register functions Add core functions to handle writes to the ep93xx software locked registers. There are a number of registers in the EP93xx System Controller that require a write to the software lock register before they can be updated. This patch adds a number of exported functions to the ep93xx core that handle this access. The software locked clock divider registers, VidClkDiv, MIRClkDiv, I2SClkDiv and KeyTchClkDiv would typically involve writing a specific value to the register. To support this the ep93xx_syscon_swlocked_write() function is provided. For the DeviceCfg register it's more typical to only need to set or clear a single bit. A generic ep93xx_devcfg_set_clear() function is provided to handle both operations. Two inline functions, ep93xx_devcfg_set_bits() and ep93xx_devcfg_clear_bits() are also provided to improve code readability. In addition, the remaining bits in the System Controller Device Config Register have been documented and the previously defined names shortened. All code paths that use this functionality have been updated except for arch/arm/kernel/crunch.c. That code is in a context switch path, which is not reentrant, so it is safe against itself. Cc: Lennert Buytenhek Cc: Matthias Kaehlcke Signed-off-by: H Hartley Sweeten Acked-by: Ryan Mallon Signed-off-by: Russell King --- arch/arm/kernel/crunch.c | 13 +++++-- arch/arm/mach-ep93xx/clock.c | 24 +++++++----- arch/arm/mach-ep93xx/core.c | 52 ++++++++++++++++++++----- arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h | 36 ++++++++++++++--- arch/arm/mach-ep93xx/include/mach/platform.h | 16 ++++++++ arch/arm/mach-ep93xx/include/mach/system.h | 12 +++--- 6 files changed, 118 insertions(+), 35 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/crunch.c b/arch/arm/kernel/crunch.c index 99995c2b2312..769abe15cf91 100644 --- a/arch/arm/kernel/crunch.c +++ b/arch/arm/kernel/crunch.c @@ -31,7 +31,7 @@ void crunch_task_release(struct thread_info *thread) static int crunch_enabled(u32 devcfg) { - return !!(devcfg & EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE); + return !!(devcfg & EP93XX_SYSCON_DEVCFG_CPENA); } static int crunch_do(struct notifier_block *self, unsigned long cmd, void *t) @@ -56,11 +56,16 @@ static int crunch_do(struct notifier_block *self, unsigned long cmd, void *t) break; case THREAD_NOTIFY_SWITCH: - devcfg = __raw_readl(EP93XX_SYSCON_DEVICE_CONFIG); + devcfg = __raw_readl(EP93XX_SYSCON_DEVCFG); if (crunch_enabled(devcfg) || crunch_owner == crunch_state) { - devcfg ^= EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE; + /* + * We don't use ep93xx_syscon_swlocked_write() here + * because we are on the context switch path and + * preemption is already disabled. + */ + devcfg ^= EP93XX_SYSCON_DEVCFG_CPENA; __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); - __raw_writel(devcfg, EP93XX_SYSCON_DEVICE_CONFIG); + __raw_writel(devcfg, EP93XX_SYSCON_DEVCFG); } break; } diff --git a/arch/arm/mach-ep93xx/clock.c b/arch/arm/mach-ep93xx/clock.c index 6c4c1633ed12..c7642acfd022 100644 --- a/arch/arm/mach-ep93xx/clock.c +++ b/arch/arm/mach-ep93xx/clock.c @@ -50,20 +50,20 @@ static unsigned long get_uart_rate(struct clk *clk); static struct clk clk_uart1 = { .sw_locked = 1, - .enable_reg = EP93XX_SYSCON_DEVICE_CONFIG, - .enable_mask = EP93XX_SYSCON_DEVICE_CONFIG_U1EN, + .enable_reg = EP93XX_SYSCON_DEVCFG, + .enable_mask = EP93XX_SYSCON_DEVCFG_U1EN, .get_rate = get_uart_rate, }; static struct clk clk_uart2 = { .sw_locked = 1, - .enable_reg = EP93XX_SYSCON_DEVICE_CONFIG, - .enable_mask = EP93XX_SYSCON_DEVICE_CONFIG_U2EN, + .enable_reg = EP93XX_SYSCON_DEVCFG, + .enable_mask = EP93XX_SYSCON_DEVCFG_U2EN, .get_rate = get_uart_rate, }; static struct clk clk_uart3 = { .sw_locked = 1, - .enable_reg = EP93XX_SYSCON_DEVICE_CONFIG, - .enable_mask = EP93XX_SYSCON_DEVICE_CONFIG_U3EN, + .enable_reg = EP93XX_SYSCON_DEVCFG, + .enable_mask = EP93XX_SYSCON_DEVCFG_U3EN, .get_rate = get_uart_rate, }; static struct clk clk_pll1; @@ -160,9 +160,11 @@ int clk_enable(struct clk *clk) u32 value; value = __raw_readl(clk->enable_reg); + value |= clk->enable_mask; if (clk->sw_locked) - __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); - __raw_writel(value | clk->enable_mask, clk->enable_reg); + ep93xx_syscon_swlocked_write(value, clk->enable_reg); + else + __raw_writel(value, clk->enable_reg); } return 0; @@ -175,9 +177,11 @@ void clk_disable(struct clk *clk) u32 value; value = __raw_readl(clk->enable_reg); + value &= ~clk->enable_mask; if (clk->sw_locked) - __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); - __raw_writel(value & ~clk->enable_mask, clk->enable_reg); + ep93xx_syscon_swlocked_write(value, clk->enable_reg); + else + __raw_writel(value, clk->enable_reg); } } EXPORT_SYMBOL(clk_disable); diff --git a/arch/arm/mach-ep93xx/core.c b/arch/arm/mach-ep93xx/core.c index 204dc5cbd0b8..9399d3af9906 100644 --- a/arch/arm/mach-ep93xx/core.c +++ b/arch/arm/mach-ep93xx/core.c @@ -384,6 +384,47 @@ void __init ep93xx_init_irq(void) } +/************************************************************************* + * EP93xx System Controller Software Locked register handling + *************************************************************************/ + +/* + * syscon_swlock prevents anything else from writing to the syscon + * block while a software locked register is being written. + */ +static DEFINE_SPINLOCK(syscon_swlock); + +void ep93xx_syscon_swlocked_write(unsigned int val, unsigned int reg) +{ + unsigned long flags; + + spin_lock_irqsave(&syscon_swlock, flags); + + __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); + __raw_writel(val, reg); + + spin_unlock_irqrestore(&syscon_swlock, flags); +} +EXPORT_SYMBOL(ep93xx_syscon_swlocked_write); + +void ep93xx_devcfg_set_clear(unsigned int set_bits, unsigned int clear_bits) +{ + unsigned long flags; + unsigned int val; + + spin_lock_irqsave(&syscon_swlock, flags); + + val = __raw_readl(EP93XX_SYSCON_DEVCFG); + val |= set_bits; + val &= ~clear_bits; + __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); + __raw_writel(val, EP93XX_SYSCON_DEVCFG); + + spin_unlock_irqrestore(&syscon_swlock, flags); +} +EXPORT_SYMBOL(ep93xx_devcfg_set_clear); + + /************************************************************************* * EP93xx peripheral handling *************************************************************************/ @@ -550,15 +591,8 @@ extern void ep93xx_gpio_init(void); void __init ep93xx_init_devices(void) { - unsigned int v; - - /* - * Disallow access to MaverickCrunch initially. - */ - v = __raw_readl(EP93XX_SYSCON_DEVICE_CONFIG); - v &= ~EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE; - __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); - __raw_writel(v, EP93XX_SYSCON_DEVICE_CONFIG); + /* Disallow access to MaverickCrunch initially */ + ep93xx_devcfg_clear_bits(EP93XX_SYSCON_DEVCFG_CPENA); ep93xx_gpio_init(); diff --git a/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h b/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h index 49b256b3ddfc..087d22b8f351 100644 --- a/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h +++ b/arch/arm/mach-ep93xx/include/mach/ep93xx-regs.h @@ -175,11 +175,37 @@ #define EP93XX_SYSCON_STANDBY EP93XX_SYSCON_REG(0x0c) #define EP93XX_SYSCON_CLOCK_SET1 EP93XX_SYSCON_REG(0x20) #define EP93XX_SYSCON_CLOCK_SET2 EP93XX_SYSCON_REG(0x24) -#define EP93XX_SYSCON_DEVICE_CONFIG EP93XX_SYSCON_REG(0x80) -#define EP93XX_SYSCON_DEVICE_CONFIG_U3EN (1<<24) -#define EP93XX_SYSCON_DEVICE_CONFIG_CRUNCH_ENABLE (1<<23) -#define EP93XX_SYSCON_DEVICE_CONFIG_U2EN (1<<20) -#define EP93XX_SYSCON_DEVICE_CONFIG_U1EN (1<<18) +#define EP93XX_SYSCON_DEVCFG EP93XX_SYSCON_REG(0x80) +#define EP93XX_SYSCON_DEVCFG_SWRST (1<<31) +#define EP93XX_SYSCON_DEVCFG_D1ONG (1<<30) +#define EP93XX_SYSCON_DEVCFG_D0ONG (1<<29) +#define EP93XX_SYSCON_DEVCFG_IONU2 (1<<28) +#define EP93XX_SYSCON_DEVCFG_GONK (1<<27) +#define EP93XX_SYSCON_DEVCFG_TONG (1<<26) +#define EP93XX_SYSCON_DEVCFG_MONG (1<<25) +#define EP93XX_SYSCON_DEVCFG_U3EN (1<<24) +#define EP93XX_SYSCON_DEVCFG_CPENA (1<<23) +#define EP93XX_SYSCON_DEVCFG_A2ONG (1<<22) +#define EP93XX_SYSCON_DEVCFG_A1ONG (1<<21) +#define EP93XX_SYSCON_DEVCFG_U2EN (1<<20) +#define EP93XX_SYSCON_DEVCFG_EXVC (1<<19) +#define EP93XX_SYSCON_DEVCFG_U1EN (1<<18) +#define EP93XX_SYSCON_DEVCFG_TIN (1<<17) +#define EP93XX_SYSCON_DEVCFG_HC3IN (1<<15) +#define EP93XX_SYSCON_DEVCFG_HC3EN (1<<14) +#define EP93XX_SYSCON_DEVCFG_HC1IN (1<<13) +#define EP93XX_SYSCON_DEVCFG_HC1EN (1<<12) +#define EP93XX_SYSCON_DEVCFG_HONIDE (1<<11) +#define EP93XX_SYSCON_DEVCFG_GONIDE (1<<10) +#define EP93XX_SYSCON_DEVCFG_PONG (1<<9) +#define EP93XX_SYSCON_DEVCFG_EONIDE (1<<8) +#define EP93XX_SYSCON_DEVCFG_I2SONSSP (1<<7) +#define EP93XX_SYSCON_DEVCFG_I2SONAC97 (1<<6) +#define EP93XX_SYSCON_DEVCFG_RASONP3 (1<<4) +#define EP93XX_SYSCON_DEVCFG_RAS (1<<3) +#define EP93XX_SYSCON_DEVCFG_ADCPD (1<<2) +#define EP93XX_SYSCON_DEVCFG_KEYS (1<<1) +#define EP93XX_SYSCON_DEVCFG_SHENA (1<<0) #define EP93XX_SYSCON_SWLOCK EP93XX_SYSCON_REG(0xc0) #define EP93XX_WATCHDOG_BASE EP93XX_APB_IOMEM(0x00140000) diff --git a/arch/arm/mach-ep93xx/include/mach/platform.h b/arch/arm/mach-ep93xx/include/mach/platform.h index 05f0f4f2f3ce..fb5e59a3ea04 100644 --- a/arch/arm/mach-ep93xx/include/mach/platform.h +++ b/arch/arm/mach-ep93xx/include/mach/platform.h @@ -15,8 +15,24 @@ struct ep93xx_eth_data void ep93xx_map_io(void); void ep93xx_init_irq(void); void ep93xx_init_time(unsigned long); + +/* EP93xx System Controller software locked register write */ +void ep93xx_syscon_swlocked_write(unsigned int val, unsigned int reg); +void ep93xx_devcfg_set_clear(unsigned int set_bits, unsigned int clear_bits); + +static inline void ep93xx_devcfg_set_bits(unsigned int bits) +{ + ep93xx_devcfg_set_clear(bits, 0x00); +} + +static inline void ep93xx_devcfg_clear_bits(unsigned int bits) +{ + ep93xx_devcfg_set_clear(0x00, bits); +} + void ep93xx_register_eth(struct ep93xx_eth_data *data, int copy_addr); void ep93xx_register_i2c(struct i2c_board_info *devices, int num); + void ep93xx_init_devices(void); extern struct sys_timer ep93xx_timer; diff --git a/arch/arm/mach-ep93xx/include/mach/system.h b/arch/arm/mach-ep93xx/include/mach/system.h index ed8f35e4f068..6d661fe9d66c 100644 --- a/arch/arm/mach-ep93xx/include/mach/system.h +++ b/arch/arm/mach-ep93xx/include/mach/system.h @@ -11,15 +11,13 @@ static inline void arch_idle(void) static inline void arch_reset(char mode, const char *cmd) { - u32 devicecfg; - local_irq_disable(); - devicecfg = __raw_readl(EP93XX_SYSCON_DEVICE_CONFIG); - __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); - __raw_writel(devicecfg | 0x80000000, EP93XX_SYSCON_DEVICE_CONFIG); - __raw_writel(0xaa, EP93XX_SYSCON_SWLOCK); - __raw_writel(devicecfg & ~0x80000000, EP93XX_SYSCON_DEVICE_CONFIG); + /* + * Set then clear the SWRST bit to initiate a software reset + */ + ep93xx_devcfg_set_bits(EP93XX_SYSCON_DEVCFG_SWRST); + ep93xx_devcfg_clear_bits(EP93XX_SYSCON_DEVCFG_SWRST); while (1) ; -- cgit v1.2.3 From 4bf1fa5a34aa2dd0d2cc58f0fc213a2e22d007a4 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Tue, 21 Jul 2009 09:56:27 +0100 Subject: [ARM] 5613/1: implement CALLER_ADDRESSx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From: Uwe Kleine-König As __builtin_return_address(n) doesn't work for ARM with n > 0, the kernel needs its own implementation. This fixes many warnings saying: warning: unsupported argument to '__builtin_return_address' The new methods and walk_stackframe must not be instrumented because CALLER_ADDRESSx is used in the various tracers and tracing the tracer is a bad idea. What's currently missing is an implementation using unwind tables. This is not fatal though, it's just that the tracers don't get enough information to be really useful. Note that if both ARM_UNWIND and FRAME_POINTER are enabled, walk_stackframe uses unwind information. So in this case the same implementation is used as when FRAME_POINTER is disabled. Cc: Catalin Marinas Signed-off-by: Uwe Kleine-König Signed-off-by: Russell King --- arch/arm/include/asm/ftrace.h | 34 +++++++++++++++++++ arch/arm/kernel/Makefile | 4 ++- arch/arm/kernel/return_address.c | 71 ++++++++++++++++++++++++++++++++++++++++ arch/arm/kernel/stacktrace.c | 4 +-- 4 files changed, 110 insertions(+), 3 deletions(-) create mode 100644 arch/arm/kernel/return_address.c (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 39c8bc1a006a..d74265cffd86 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -11,4 +11,38 @@ extern void mcount(void); #endif +#ifndef __ASSEMBLY__ + +#if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) +/* + * return_address uses walk_stackframe to do it's work. If both + * CONFIG_FRAME_POINTER=y and CONFIG_ARM_UNWIND=y walk_stackframe uses unwind + * information. For this to work in the function tracer many functions would + * have to be marked with __notrace. So for now just depend on + * !CONFIG_ARM_UNWIND. + */ + +void *return_address(unsigned int); + +#else + +extern inline void *return_address(unsigned int level) +{ + return NULL; +} + +#endif + +#define HAVE_ARCH_CALLER_ADDR + +#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0)) +#define CALLER_ADDR1 ((unsigned long)return_address(1)) +#define CALLER_ADDR2 ((unsigned long)return_address(2)) +#define CALLER_ADDR3 ((unsigned long)return_address(3)) +#define CALLER_ADDR4 ((unsigned long)return_address(4)) +#define CALLER_ADDR5 ((unsigned long)return_address(5)) +#define CALLER_ADDR6 ((unsigned long)return_address(6)) + +#endif /* ifndef __ASSEMBLY__ */ + #endif /* _ASM_ARM_FTRACE */ diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index ff89d0b3abc5..3213c9382b17 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -8,10 +8,12 @@ ifdef CONFIG_DYNAMIC_FTRACE CFLAGS_REMOVE_ftrace.o = -pg endif +CFLAGS_REMOVE_return_address.o = -pg + # Object file lists. obj-y := compat.o elf.o entry-armv.o entry-common.o irq.o \ - process.o ptrace.o setup.o signal.o \ + process.o ptrace.o return_address.o setup.o signal.o \ sys_arm.o stacktrace.o time.o traps.o obj-$(CONFIG_ISA_DMA_API) += dma.o diff --git a/arch/arm/kernel/return_address.c b/arch/arm/kernel/return_address.c new file mode 100644 index 000000000000..df246da4ceca --- /dev/null +++ b/arch/arm/kernel/return_address.c @@ -0,0 +1,71 @@ +/* + * arch/arm/kernel/return_address.c + * + * Copyright (C) 2009 Uwe Kleine-Koenig + * for Pengutronix + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ +#include + +#if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) +#include + +#include + +struct return_address_data { + unsigned int level; + void *addr; +}; + +static int save_return_addr(struct stackframe *frame, void *d) +{ + struct return_address_data *data = d; + + if (!data->level) { + data->addr = (void *)frame->lr; + + return 1; + } else { + --data->level; + return 0; + } +} + +void *return_address(unsigned int level) +{ + struct return_address_data data; + struct stackframe frame; + register unsigned long current_sp asm ("sp"); + + data.level = level + 1; + + frame.fp = (unsigned long)__builtin_frame_address(0); + frame.sp = current_sp; + frame.lr = (unsigned long)__builtin_return_address(0); + frame.pc = (unsigned long)return_address; + + walk_stackframe(&frame, save_return_addr, &data); + + if (!data.level) + return data.addr; + else + return NULL; +} + +#else /* if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) */ + +#if defined(CONFIG_ARM_UNWIND) +#warning "TODO: return_address should use unwind tables" +#endif + +void *return_address(unsigned int level) +{ + return NULL; +} + +#endif /* if defined(CONFIG_FRAME_POINTER) && !defined(CONFIG_ARM_UNWIND) / else */ + +EXPORT_SYMBOL_GPL(return_address); diff --git a/arch/arm/kernel/stacktrace.c b/arch/arm/kernel/stacktrace.c index 9f444e5cc165..20b7411e47fd 100644 --- a/arch/arm/kernel/stacktrace.c +++ b/arch/arm/kernel/stacktrace.c @@ -21,7 +21,7 @@ * Note that with framepointer enabled, even the leaf functions have the same * prologue and epilogue, therefore we can ignore the LR value in this case. */ -int unwind_frame(struct stackframe *frame) +int notrace unwind_frame(struct stackframe *frame) { unsigned long high, low; unsigned long fp = frame->fp; @@ -43,7 +43,7 @@ int unwind_frame(struct stackframe *frame) } #endif -void walk_stackframe(struct stackframe *frame, +void notrace walk_stackframe(struct stackframe *frame, int (*fn)(struct stackframe *, void *), void *data) { while (1) { -- cgit v1.2.3 From 88987ef91b99cf99bc5d167caeb31d4958fbf931 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Jul 2009 12:32:52 +0100 Subject: Thumb-2: Add some .align statements to the .S files Since the Thumb-2 instructions can be 16-bit wide, data in the .text sections may not be aligned to a 32-bit word and this leads to unaligned exceptions. This patch does not affect the ARM code generation. Signed-off-by: Catalin Marinas --- arch/arm/boot/compressed/head.S | 3 +++ arch/arm/kernel/entry-armv.S | 4 ++++ arch/arm/kernel/head-common.S | 2 ++ arch/arm/lib/sha1.S | 2 ++ arch/arm/vfp/entry.S | 2 ++ 5 files changed, 13 insertions(+) (limited to 'arch/arm/kernel') diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 4515728c5345..82f5fcfd9567 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -291,6 +291,7 @@ wont_overwrite: mov r0, r4 bl decompress_kernel b call_kernel + .align 2 .type LC0, #object LC0: .word LC0 @ r1 .word __bss_start @ r2 @@ -589,6 +590,7 @@ call_cache_fn: adr r12, proc_types * methods. Writeback caches _must_ have the flush method * defined. */ + .align 2 .type proc_types,#object proc_types: .word 0x41560600 @ ARM6/610 @@ -945,6 +947,7 @@ __armv3_mpu_cache_flush: * memory, which again must be relocatable. */ #ifdef DEBUG + .align 2 .type phexbuf,#object phexbuf: .space 12 .size phexbuf, . - phexbuf diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index fc8af43c5000..0befd1cabf45 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -1065,6 +1065,10 @@ vector_\name: ldr lr, [pc, lr, lsl #2] movs pc, lr @ branch to handler in SVC mode ENDPROC(vector_\name) + + .align 2 + @ handler addresses follow this label +1: .endm .globl __stubs_start diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index 991952c644d1..dbfaef07a1f4 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -14,6 +14,7 @@ #define ATAG_CORE 0x54410001 #define ATAG_CORE_SIZE ((2*4 + 3*4) >> 2) + .align 2 .type __switch_data, %object __switch_data: .long __mmap_switched @@ -185,6 +186,7 @@ ENDPROC(lookup_processor_type) * Look in and arch/arm/kernel/arch.[ch] for * more information about the __proc_info and __arch_info structures. */ + .align 2 .long __proc_info_begin .long __proc_info_end 3: .long . diff --git a/arch/arm/lib/sha1.S b/arch/arm/lib/sha1.S index a16fb208c841..09b548cac1a4 100644 --- a/arch/arm/lib/sha1.S +++ b/arch/arm/lib/sha1.S @@ -187,6 +187,7 @@ ENTRY(sha_transform) ENDPROC(sha_transform) + .align 2 .L_sha_K: .word 0x5a827999, 0x6ed9eba1, 0x8f1bbcdc, 0xca62c1d6 @@ -195,6 +196,7 @@ ENDPROC(sha_transform) * void sha_init(__u32 *buf) */ + .align 2 .L_sha_initial_digest: .word 0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476, 0xc3d2e1f0 diff --git a/arch/arm/vfp/entry.S b/arch/arm/vfp/entry.S index a2bed62aec21..4fa9903b83cf 100644 --- a/arch/arm/vfp/entry.S +++ b/arch/arm/vfp/entry.S @@ -42,6 +42,7 @@ ENTRY(vfp_null_entry) mov pc, lr ENDPROC(vfp_null_entry) + .align 2 .LCvfp: .word vfp_vector @@ -61,6 +62,7 @@ ENTRY(vfp_testing_entry) mov pc, r9 @ we have handled the fault ENDPROC(vfp_testing_entry) + .align 2 VFP_arch_address: .word VFP_arch -- cgit v1.2.3 From b86040a59feb255a8193173caa4d5199464433d5 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Jul 2009 12:32:54 +0100 Subject: Thumb-2: Implementation of the unified start-up and exceptions code This patch implements the ARM/Thumb-2 unified kernel start-up and exception handling code. Signed-off-by: Catalin Marinas --- arch/arm/include/asm/assembler.h | 11 +++ arch/arm/include/asm/futex.h | 1 + arch/arm/kernel/entry-armv.S | 165 +++++++++++++++++++++++---------------- arch/arm/kernel/entry-common.S | 28 ++----- arch/arm/kernel/entry-header.S | 92 ++++++++++++++++++++-- arch/arm/kernel/head-common.S | 13 +-- arch/arm/kernel/head-nommu.S | 11 ++- arch/arm/kernel/head.S | 28 ++++--- arch/arm/kernel/process.c | 2 +- arch/arm/kernel/setup.c | 28 +++++-- arch/arm/kernel/unwind.c | 4 + 11 files changed, 263 insertions(+), 120 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 15f8a092b700..1acd1da36d00 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -127,3 +127,14 @@ #endif #endif .endm + +#ifdef CONFIG_THUMB2_KERNEL + .macro setmode, mode, reg + mov \reg, #\mode + msr cpsr_c, \reg + .endm +#else + .macro setmode, mode, reg + msr cpsr_c, #\mode + .endm +#endif diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h index 9ee743b95de8..bfcc15929a7f 100644 --- a/arch/arm/include/asm/futex.h +++ b/arch/arm/include/asm/futex.h @@ -99,6 +99,7 @@ futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, int newval) __asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n" "1: ldrt %0, [%3]\n" " teq %0, %1\n" + " it eq @ explicit IT needed for the 2b label\n" "2: streqt %2, [%3]\n" "3:\n" " .section __ex_table,\"a\"\n" diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 0befd1cabf45..468425f937dd 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -34,7 +34,7 @@ @ @ routine called with r0 = irq number, r1 = struct pt_regs * @ - adrne lr, 1b + adrne lr, BSYM(1b) bne asm_do_IRQ #ifdef CONFIG_SMP @@ -46,13 +46,13 @@ */ test_for_ipi r0, r6, r5, lr movne r0, sp - adrne lr, 1b + adrne lr, BSYM(1b) bne do_IPI #ifdef CONFIG_LOCAL_TIMERS test_for_ltirq r0, r6, r5, lr movne r0, sp - adrne lr, 1b + adrne lr, BSYM(1b) bne do_local_timer #endif #endif @@ -70,7 +70,10 @@ */ .macro inv_entry, reason sub sp, sp, #S_FRAME_SIZE - stmib sp, {r1 - lr} + ARM( stmib sp, {r1 - lr} ) + THUMB( stmia sp, {r0 - r12} ) + THUMB( str sp, [sp, #S_SP] ) + THUMB( str lr, [sp, #S_LR] ) mov r1, #\reason .endm @@ -126,17 +129,24 @@ ENDPROC(__und_invalid) .macro svc_entry, stack_hole=0 UNWIND(.fnstart ) UNWIND(.save {r0 - pc} ) - sub sp, sp, #(S_FRAME_SIZE + \stack_hole) + sub sp, sp, #(S_FRAME_SIZE + \stack_hole - 4) +#ifdef CONFIG_THUMB2_KERNEL + SPFIX( str r0, [sp] ) @ temporarily saved + SPFIX( mov r0, sp ) + SPFIX( tst r0, #4 ) @ test original stack alignment + SPFIX( ldr r0, [sp] ) @ restored +#else SPFIX( tst sp, #4 ) - SPFIX( bicne sp, sp, #4 ) - stmib sp, {r1 - r12} +#endif + SPFIX( subeq sp, sp, #4 ) + stmia sp, {r1 - r12} ldmia r0, {r1 - r3} - add r5, sp, #S_SP @ here for interlock avoidance + add r5, sp, #S_SP - 4 @ here for interlock avoidance mov r4, #-1 @ "" "" "" "" - add r0, sp, #(S_FRAME_SIZE + \stack_hole) - SPFIX( addne r0, r0, #4 ) - str r1, [sp] @ save the "real" r0 copied + add r0, sp, #(S_FRAME_SIZE + \stack_hole - 4) + SPFIX( addeq r0, r0, #4 ) + str r1, [sp, #-4]! @ save the "real" r0 copied @ from the exception stack mov r1, lr @@ -196,9 +206,8 @@ __dabt_svc: @ @ restore SPSR and restart the instruction @ - ldr r0, [sp, #S_PSR] - msr spsr_cxsf, r0 - ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr + ldr r2, [sp, #S_PSR] + svc_exit r2 @ return from exception UNWIND(.fnend ) ENDPROC(__dabt_svc) @@ -225,13 +234,12 @@ __irq_svc: tst r0, #_TIF_NEED_RESCHED blne svc_preempt #endif - ldr r0, [sp, #S_PSR] @ irqs are already disabled - msr spsr_cxsf, r0 + ldr r4, [sp, #S_PSR] @ irqs are already disabled #ifdef CONFIG_TRACE_IRQFLAGS - tst r0, #PSR_I_BIT + tst r4, #PSR_I_BIT bleq trace_hardirqs_on #endif - ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr + svc_exit r4 @ return from exception UNWIND(.fnend ) ENDPROC(__irq_svc) @@ -266,7 +274,7 @@ __und_svc: @ r0 - instruction @ ldr r0, [r2, #-4] - adr r9, 1f + adr r9, BSYM(1f) bl call_fpe mov r0, sp @ struct pt_regs *regs @@ -280,9 +288,8 @@ __und_svc: @ @ restore SPSR and restart the instruction @ - ldr lr, [sp, #S_PSR] @ Get SVC cpsr - msr spsr_cxsf, lr - ldmia sp, {r0 - pc}^ @ Restore SVC registers + ldr r2, [sp, #S_PSR] @ Get SVC cpsr + svc_exit r2 @ return from exception UNWIND(.fnend ) ENDPROC(__und_svc) @@ -323,9 +330,8 @@ __pabt_svc: @ @ restore SPSR and restart the instruction @ - ldr r0, [sp, #S_PSR] - msr spsr_cxsf, r0 - ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr + ldr r2, [sp, #S_PSR] + svc_exit r2 @ return from exception UNWIND(.fnend ) ENDPROC(__pabt_svc) @@ -353,7 +359,8 @@ ENDPROC(__pabt_svc) UNWIND(.fnstart ) UNWIND(.cantunwind ) @ don't unwind the user space sub sp, sp, #S_FRAME_SIZE - stmib sp, {r1 - r12} + ARM( stmib sp, {r1 - r12} ) + THUMB( stmia sp, {r0 - r12} ) ldmia r0, {r1 - r3} add r0, sp, #S_PC @ here for interlock avoidance @@ -372,7 +379,8 @@ ENDPROC(__pabt_svc) @ Also, separately save sp_usr and lr_usr @ stmia r0, {r2 - r4} - stmdb r0, {sp, lr}^ + ARM( stmdb r0, {sp, lr}^ ) + THUMB( store_user_sp_lr r0, r1, S_SP - S_PC ) @ @ Enable the alignment trap while in kernel mode @@ -427,7 +435,7 @@ __dabt_usr: @ enable_irq mov r2, sp - adr lr, ret_from_exception + adr lr, BSYM(ret_from_exception) b do_DataAbort UNWIND(.fnend ) ENDPROC(__dabt_usr) @@ -452,7 +460,9 @@ __irq_usr: ldr r0, [tsk, #TI_PREEMPT] str r8, [tsk, #TI_PREEMPT] teq r0, r7 - strne r0, [r0, -r0] + ARM( strne r0, [r0, -r0] ) + THUMB( movne r0, #0 ) + THUMB( strne r0, [r0] ) #endif #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on @@ -476,9 +486,10 @@ __und_usr: @ @ r0 - instruction @ - adr r9, ret_from_exception - adr lr, __und_usr_unknown + adr r9, BSYM(ret_from_exception) + adr lr, BSYM(__und_usr_unknown) tst r3, #PSR_T_BIT @ Thumb mode? + itet eq @ explicit IT needed for the 1f label subeq r4, r2, #4 @ ARM instr at LR - 4 subne r4, r2, #2 @ Thumb instr at LR - 2 1: ldreqt r0, [r4] @@ -488,7 +499,10 @@ __und_usr: beq call_fpe @ Thumb instruction #if __LINUX_ARM_ARCH__ >= 7 -2: ldrht r5, [r4], #2 +2: + ARM( ldrht r5, [r4], #2 ) + THUMB( ldrht r5, [r4] ) + THUMB( add r4, r4, #2 ) and r0, r5, #0xf800 @ mask bits 111x x... .... .... cmp r0, #0xe800 @ 32bit instruction if xx != 0 blo __und_usr_unknown @@ -577,9 +591,11 @@ call_fpe: moveq pc, lr get_thread_info r10 @ get current thread and r8, r0, #0x00000f00 @ mask out CP number + THUMB( lsr r8, r8, #8 ) mov r7, #1 add r6, r10, #TI_USED_CP - strb r7, [r6, r8, lsr #8] @ set appropriate used_cp[] + ARM( strb r7, [r6, r8, lsr #8] ) @ set appropriate used_cp[] + THUMB( strb r7, [r6, r8] ) @ set appropriate used_cp[] #ifdef CONFIG_IWMMXT @ Test if we need to give access to iWMMXt coprocessors ldr r5, [r10, #TI_FLAGS] @@ -587,36 +603,38 @@ call_fpe: movcss r7, r5, lsr #(TIF_USING_IWMMXT + 1) bcs iwmmxt_task_enable #endif - add pc, pc, r8, lsr #6 - mov r0, r0 - - mov pc, lr @ CP#0 - b do_fpe @ CP#1 (FPE) - b do_fpe @ CP#2 (FPE) - mov pc, lr @ CP#3 + ARM( add pc, pc, r8, lsr #6 ) + THUMB( lsl r8, r8, #2 ) + THUMB( add pc, r8 ) + nop + + W(mov) pc, lr @ CP#0 + W(b) do_fpe @ CP#1 (FPE) + W(b) do_fpe @ CP#2 (FPE) + W(mov) pc, lr @ CP#3 #ifdef CONFIG_CRUNCH b crunch_task_enable @ CP#4 (MaverickCrunch) b crunch_task_enable @ CP#5 (MaverickCrunch) b crunch_task_enable @ CP#6 (MaverickCrunch) #else - mov pc, lr @ CP#4 - mov pc, lr @ CP#5 - mov pc, lr @ CP#6 + W(mov) pc, lr @ CP#4 + W(mov) pc, lr @ CP#5 + W(mov) pc, lr @ CP#6 #endif - mov pc, lr @ CP#7 - mov pc, lr @ CP#8 - mov pc, lr @ CP#9 + W(mov) pc, lr @ CP#7 + W(mov) pc, lr @ CP#8 + W(mov) pc, lr @ CP#9 #ifdef CONFIG_VFP - b do_vfp @ CP#10 (VFP) - b do_vfp @ CP#11 (VFP) + W(b) do_vfp @ CP#10 (VFP) + W(b) do_vfp @ CP#11 (VFP) #else - mov pc, lr @ CP#10 (VFP) - mov pc, lr @ CP#11 (VFP) + W(mov) pc, lr @ CP#10 (VFP) + W(mov) pc, lr @ CP#11 (VFP) #endif - mov pc, lr @ CP#12 - mov pc, lr @ CP#13 - mov pc, lr @ CP#14 (Debug) - mov pc, lr @ CP#15 (Control) + W(mov) pc, lr @ CP#12 + W(mov) pc, lr @ CP#13 + W(mov) pc, lr @ CP#14 (Debug) + W(mov) pc, lr @ CP#15 (Control) #ifdef CONFIG_NEON .align 6 @@ -667,7 +685,7 @@ no_fp: mov pc, lr __und_usr_unknown: enable_irq mov r0, sp - adr lr, ret_from_exception + adr lr, BSYM(ret_from_exception) b do_undefinstr ENDPROC(__und_usr_unknown) @@ -711,7 +729,10 @@ ENTRY(__switch_to) UNWIND(.cantunwind ) add ip, r1, #TI_CPU_SAVE ldr r3, [r2, #TI_TP_VALUE] - stmia ip!, {r4 - sl, fp, sp, lr} @ Store most regs on stack + ARM( stmia ip!, {r4 - sl, fp, sp, lr} ) @ Store most regs on stack + THUMB( stmia ip!, {r4 - sl, fp} ) @ Store most regs on stack + THUMB( str sp, [ip], #4 ) + THUMB( str lr, [ip], #4 ) #ifdef CONFIG_MMU ldr r6, [r2, #TI_CPU_DOMAIN] #endif @@ -736,8 +757,12 @@ ENTRY(__switch_to) ldr r0, =thread_notify_head mov r1, #THREAD_NOTIFY_SWITCH bl atomic_notifier_call_chain + THUMB( mov ip, r4 ) mov r0, r5 - ldmia r4, {r4 - sl, fp, sp, pc} @ Load all regs saved previously + ARM( ldmia r4, {r4 - sl, fp, sp, pc} ) @ Load all regs saved previously + THUMB( ldmia ip!, {r4 - sl, fp} ) @ Load all regs saved previously + THUMB( ldr sp, [ip], #4 ) + THUMB( ldr pc, [ip] ) UNWIND(.fnend ) ENDPROC(__switch_to) @@ -772,6 +797,7 @@ ENDPROC(__switch_to) * if your compiled code is not going to use the new instructions for other * purpose. */ + THUMB( .arm ) .macro usr_ret, reg #ifdef CONFIG_ARM_THUMB @@ -1020,6 +1046,7 @@ __kuser_helper_version: @ 0xffff0ffc .globl __kuser_helper_end __kuser_helper_end: + THUMB( .thumb ) /* * Vector stubs. @@ -1054,15 +1081,17 @@ vector_\name: @ Prepare for SVC32 mode. IRQs remain disabled. @ mrs r0, cpsr - eor r0, r0, #(\mode ^ SVC_MODE) + eor r0, r0, #(\mode ^ SVC_MODE | PSR_ISETSTATE) msr spsr_cxsf, r0 @ @ the branch table must immediately follow this code @ and lr, lr, #0x0f + THUMB( adr r0, 1f ) + THUMB( ldr lr, [r0, lr, lsl #2] ) mov r0, sp - ldr lr, [pc, lr, lsl #2] + ARM( ldr lr, [pc, lr, lsl #2] ) movs pc, lr @ branch to handler in SVC mode ENDPROC(vector_\name) @@ -1206,14 +1235,16 @@ __stubs_end: .globl __vectors_start __vectors_start: - swi SYS_ERROR0 - b vector_und + stubs_offset - ldr pc, .LCvswi + stubs_offset - b vector_pabt + stubs_offset - b vector_dabt + stubs_offset - b vector_addrexcptn + stubs_offset - b vector_irq + stubs_offset - b vector_fiq + stubs_offset + ARM( swi SYS_ERROR0 ) + THUMB( svc #0 ) + THUMB( nop ) + W(b) vector_und + stubs_offset + W(ldr) pc, .LCvswi + stubs_offset + W(b) vector_pabt + stubs_offset + W(b) vector_dabt + stubs_offset + W(b) vector_addrexcptn + stubs_offset + W(b) vector_irq + stubs_offset + W(b) vector_fiq + stubs_offset .globl __vectors_end __vectors_end: diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 366e5097a41a..a0540c9f1f0c 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -33,14 +33,7 @@ ret_fast_syscall: /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr - @ fast_restore_user_regs - ldr r1, [sp, #S_OFF + S_PSR] @ get calling cpsr - ldr lr, [sp, #S_OFF + S_PC]! @ get pc - msr spsr_cxsf, r1 @ save in spsr_svc - ldmdb sp, {r1 - lr}^ @ get calling r1 - lr - mov r0, r0 - add sp, sp, #S_FRAME_SIZE - S_PC - movs pc, lr @ return & move spsr_svc into cpsr + restore_user_regs fast = 1, offset = S_OFF UNWIND(.fnend ) /* @@ -73,14 +66,7 @@ no_work_pending: /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr - @ slow_restore_user_regs - ldr r1, [sp, #S_PSR] @ get calling cpsr - ldr lr, [sp, #S_PC]! @ get pc - msr spsr_cxsf, r1 @ save in spsr_svc - ldmdb sp, {r0 - lr}^ @ get calling r0 - lr - mov r0, r0 - add sp, sp, #S_FRAME_SIZE - S_PC - movs pc, lr @ return & move spsr_svc into cpsr + restore_user_regs fast = 0, offset = 0 ENDPROC(ret_to_user) /* @@ -182,8 +168,10 @@ ftrace_stub: ENTRY(vector_swi) sub sp, sp, #S_FRAME_SIZE stmia sp, {r0 - r12} @ Calling r0 - r12 - add r8, sp, #S_PC - stmdb r8, {sp, lr}^ @ Calling sp, lr + ARM( add r8, sp, #S_PC ) + ARM( stmdb r8, {sp, lr}^ ) @ Calling sp, lr + THUMB( mov r8, sp ) + THUMB( store_user_sp_lr r8, r10, S_SP ) @ calling sp, lr mrs r8, spsr @ called from non-FIQ mode, so ok. str lr, [sp, #S_PC] @ Save calling PC str r8, [sp, #S_PSR] @ Save CPSR @@ -272,7 +260,7 @@ ENTRY(vector_swi) bne __sys_trace cmp scno, #NR_syscalls @ check upper syscall limit - adr lr, ret_fast_syscall @ return address + adr lr, BSYM(ret_fast_syscall) @ return address ldrcc pc, [tbl, scno, lsl #2] @ call sys_* routine add r1, sp, #S_OFF @@ -293,7 +281,7 @@ __sys_trace: mov r0, #0 @ trace entry [IP = 0] bl syscall_trace - adr lr, __sys_trace_return @ return address + adr lr, BSYM(__sys_trace_return) @ return address mov scno, r0 @ syscall number (possibly new) add r1, sp, #S_R0 + S_OFF @ pointer to regs cmp scno, #NR_syscalls @ check upper syscall limit diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index 87ab4e157997..a4eaf4f920c5 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -36,11 +36,6 @@ #endif .endm - .macro get_thread_info, rd - mov \rd, sp, lsr #13 - mov \rd, \rd, lsl #13 - .endm - .macro alignment_trap, rtemp #ifdef CONFIG_ALIGNMENT_TRAP ldr \rtemp, .LCcralign @@ -49,6 +44,93 @@ #endif .endm + @ + @ Store/load the USER SP and LR registers by switching to the SYS + @ mode. Useful in Thumb-2 mode where "stm/ldm rd, {sp, lr}^" is not + @ available. Should only be called from SVC mode + @ + .macro store_user_sp_lr, rd, rtemp, offset = 0 + mrs \rtemp, cpsr + eor \rtemp, \rtemp, #(SVC_MODE ^ SYSTEM_MODE) + msr cpsr_c, \rtemp @ switch to the SYS mode + + str sp, [\rd, #\offset] @ save sp_usr + str lr, [\rd, #\offset + 4] @ save lr_usr + + eor \rtemp, \rtemp, #(SVC_MODE ^ SYSTEM_MODE) + msr cpsr_c, \rtemp @ switch back to the SVC mode + .endm + + .macro load_user_sp_lr, rd, rtemp, offset = 0 + mrs \rtemp, cpsr + eor \rtemp, \rtemp, #(SVC_MODE ^ SYSTEM_MODE) + msr cpsr_c, \rtemp @ switch to the SYS mode + + ldr sp, [\rd, #\offset] @ load sp_usr + ldr lr, [\rd, #\offset + 4] @ load lr_usr + + eor \rtemp, \rtemp, #(SVC_MODE ^ SYSTEM_MODE) + msr cpsr_c, \rtemp @ switch back to the SVC mode + .endm + +#ifndef CONFIG_THUMB2_KERNEL + .macro svc_exit, rpsr + msr spsr_cxsf, \rpsr + ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr + .endm + + .macro restore_user_regs, fast = 0, offset = 0 + ldr r1, [sp, #\offset + S_PSR] @ get calling cpsr + ldr lr, [sp, #\offset + S_PC]! @ get pc + msr spsr_cxsf, r1 @ save in spsr_svc + .if \fast + ldmdb sp, {r1 - lr}^ @ get calling r1 - lr + .else + ldmdb sp, {r0 - lr}^ @ get calling r0 - lr + .endif + add sp, sp, #S_FRAME_SIZE - S_PC + movs pc, lr @ return & move spsr_svc into cpsr + .endm + + .macro get_thread_info, rd + mov \rd, sp, lsr #13 + mov \rd, \rd, lsl #13 + .endm +#else /* CONFIG_THUMB2_KERNEL */ + .macro svc_exit, rpsr + ldr r0, [sp, #S_SP] @ top of the stack + ldr r1, [sp, #S_PC] @ return address + tst r0, #4 @ orig stack 8-byte aligned? + stmdb r0, {r1, \rpsr} @ rfe context + ldmia sp, {r0 - r12} + ldr lr, [sp, #S_LR] + addeq sp, sp, #S_FRAME_SIZE - 8 @ aligned + addne sp, sp, #S_FRAME_SIZE - 4 @ not aligned + rfeia sp! + .endm + + .macro restore_user_regs, fast = 0, offset = 0 + mov r2, sp + load_user_sp_lr r2, r3, \offset + S_SP @ calling sp, lr + ldr r1, [sp, #\offset + S_PSR] @ get calling cpsr + ldr lr, [sp, #\offset + S_PC] @ get pc + add sp, sp, #\offset + S_SP + msr spsr_cxsf, r1 @ save in spsr_svc + .if \fast + ldmdb sp, {r1 - r12} @ get calling r1 - r12 + .else + ldmdb sp, {r0 - r12} @ get calling r0 - r12 + .endif + add sp, sp, #S_FRAME_SIZE - S_SP + movs pc, lr @ return & move spsr_svc into cpsr + .endm + + .macro get_thread_info, rd + mov \rd, sp + lsr \rd, \rd, #13 + mov \rd, \rd, lsl #13 + .endm +#endif /* !CONFIG_THUMB2_KERNEL */ /* * These are the registers used in the syscall handler, and allow us to diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index dbfaef07a1f4..93ad576b2d74 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -52,7 +52,9 @@ __mmap_switched: strcc fp, [r6],#4 bcc 1b - ldmia r3, {r4, r5, r6, r7, sp} + ARM( ldmia r3, {r4, r5, r6, r7, sp}) + THUMB( ldmia r3, {r4, r5, r6, r7} ) + THUMB( ldr sp, [r3, #16] ) str r9, [r4] @ Save processor ID str r1, [r5] @ Save machine type str r2, [r6] @ Save atags pointer @@ -156,7 +158,8 @@ ENDPROC(__error) */ __lookup_processor_type: adr r3, 3f - ldmda r3, {r5 - r7} + ldmia r3, {r5 - r7} + add r3, r3, #8 sub r3, r3, r7 @ get offset between virt&phys add r5, r5, r3 @ convert virt addresses to add r6, r6, r3 @ physical address space @@ -187,9 +190,9 @@ ENDPROC(lookup_processor_type) * more information about the __proc_info and __arch_info structures. */ .align 2 - .long __proc_info_begin +3: .long __proc_info_begin .long __proc_info_end -3: .long . +4: .long . .long __arch_info_begin .long __arch_info_end @@ -205,7 +208,7 @@ ENDPROC(lookup_processor_type) * r5 = mach_info pointer in physical address space */ __lookup_machine_type: - adr r3, 3b + adr r3, 4b ldmia r3, {r4, r5, r6} sub r3, r3, r4 @ get offset between virt&phys add r5, r5, r3 @ convert virt addresses to diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index cc87e1765ed2..b16393d2b71e 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -34,7 +34,7 @@ */ .section ".text.head", "ax" ENTRY(stext) - msr cpsr_c, #PSR_F_BIT | PSR_I_BIT | SVC_MODE @ ensure svc mode + setmode PSR_F_BIT | PSR_I_BIT | SVC_MODE, r9 @ ensure svc mode @ and irqs disabled #ifndef CONFIG_CPU_CP15 ldr r9, =CONFIG_PROCESSOR_ID @@ -50,8 +50,10 @@ ENTRY(stext) ldr r13, __switch_data @ address to jump to after @ the initialization is done - adr lr, __after_proc_init @ return (PIC) address - add pc, r10, #PROCINFO_INITFUNC + adr lr, BSYM(__after_proc_init) @ return (PIC) address + ARM( add pc, r10, #PROCINFO_INITFUNC ) + THUMB( add r12, r10, #PROCINFO_INITFUNC ) + THUMB( mov pc, r12 ) ENDPROC(stext) /* @@ -82,7 +84,8 @@ __after_proc_init: mcr p15, 0, r0, c1, c0, 0 @ write control reg #endif /* CONFIG_CPU_CP15 */ - mov pc, r13 @ clear the BSS and jump + mov r3, r13 + mov pc, r3 @ clear the BSS and jump @ to start_kernel ENDPROC(__after_proc_init) .ltorg diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 21e17dc94cb5..38ccbe1d3b2c 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -76,7 +76,7 @@ */ .section ".text.head", "ax" ENTRY(stext) - msr cpsr_c, #PSR_F_BIT | PSR_I_BIT | SVC_MODE @ ensure svc mode + setmode PSR_F_BIT | PSR_I_BIT | SVC_MODE, r9 @ ensure svc mode @ and irqs disabled mrc p15, 0, r9, c0, c0 @ get processor id bl __lookup_processor_type @ r5=procinfo r9=cpuid @@ -97,8 +97,10 @@ ENTRY(stext) */ ldr r13, __switch_data @ address to jump to after @ mmu has been enabled - adr lr, __enable_mmu @ return (PIC) address - add pc, r10, #PROCINFO_INITFUNC + adr lr, BSYM(__enable_mmu) @ return (PIC) address + ARM( add pc, r10, #PROCINFO_INITFUNC ) + THUMB( add r12, r10, #PROCINFO_INITFUNC ) + THUMB( mov pc, r12 ) ENDPROC(stext) #if defined(CONFIG_SMP) @@ -110,7 +112,7 @@ ENTRY(secondary_startup) * the processor type - there is no need to check the machine type * as it has already been validated by the primary processor. */ - msr cpsr_c, #PSR_F_BIT | PSR_I_BIT | SVC_MODE + setmode PSR_F_BIT | PSR_I_BIT | SVC_MODE, r9 mrc p15, 0, r9, c0, c0 @ get processor id bl __lookup_processor_type movs r10, r5 @ invalid processor? @@ -121,12 +123,15 @@ ENTRY(secondary_startup) * Use the page tables supplied from __cpu_up. */ adr r4, __secondary_data - ldmia r4, {r5, r7, r13} @ address to jump to after + ldmia r4, {r5, r7, r12} @ address to jump to after sub r4, r4, r5 @ mmu has been enabled ldr r4, [r7, r4] @ get secondary_data.pgdir - adr lr, __enable_mmu @ return address - add pc, r10, #PROCINFO_INITFUNC @ initialise processor - @ (return control reg) + adr lr, BSYM(__enable_mmu) @ return address + mov r13, r12 @ __secondary_switched address + ARM( add pc, r10, #PROCINFO_INITFUNC ) @ initialise processor + @ (return control reg) + THUMB( add r12, r10, #PROCINFO_INITFUNC ) + THUMB( mov pc, r12 ) ENDPROC(secondary_startup) /* @@ -193,8 +198,8 @@ __turn_mmu_on: mcr p15, 0, r0, c1, c0, 0 @ write control reg mrc p15, 0, r3, c0, c0, 0 @ read id reg mov r3, r3 - mov r3, r3 - mov pc, r13 + mov r3, r13 + mov pc, r3 ENDPROC(__turn_mmu_on) @@ -235,7 +240,8 @@ __create_page_tables: * will be removed by paging_init(). We use our current program * counter to determine corresponding section base address. */ - mov r6, pc, lsr #20 @ start of kernel section + mov r6, pc + mov r6, r6, lsr #20 @ start of kernel section orr r3, r7, r6, lsl #20 @ flags + kernel base str r3, [r4, r6, lsl #2] @ identity mapping diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 39196dff478c..790fbee92ec5 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -388,7 +388,7 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.ARM_r2 = (unsigned long)fn; regs.ARM_r3 = (unsigned long)kernel_thread_exit; regs.ARM_pc = (unsigned long)kernel_thread_helper; - regs.ARM_cpsr = SVC_MODE | PSR_ENDSTATE; + regs.ARM_cpsr = SVC_MODE | PSR_ENDSTATE | PSR_ISETSTATE; return do_fork(flags|CLONE_VM|CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); } diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index bc5e4128f9f3..d4d4f77c91b2 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -326,26 +327,39 @@ void cpu_init(void) BUG(); } + /* + * Define the placement constraint for the inline asm directive below. + * In Thumb-2, msr with an immediate value is not allowed. + */ +#ifdef CONFIG_THUMB2_KERNEL +#define PLC "r" +#else +#define PLC "I" +#endif + /* * setup stacks for re-entrant exception handlers */ __asm__ ( "msr cpsr_c, %1\n\t" - "add sp, %0, %2\n\t" + "add r14, %0, %2\n\t" + "mov sp, r14\n\t" "msr cpsr_c, %3\n\t" - "add sp, %0, %4\n\t" + "add r14, %0, %4\n\t" + "mov sp, r14\n\t" "msr cpsr_c, %5\n\t" - "add sp, %0, %6\n\t" + "add r14, %0, %6\n\t" + "mov sp, r14\n\t" "msr cpsr_c, %7" : : "r" (stk), - "I" (PSR_F_BIT | PSR_I_BIT | IRQ_MODE), + PLC (PSR_F_BIT | PSR_I_BIT | IRQ_MODE), "I" (offsetof(struct stack, irq[0])), - "I" (PSR_F_BIT | PSR_I_BIT | ABT_MODE), + PLC (PSR_F_BIT | PSR_I_BIT | ABT_MODE), "I" (offsetof(struct stack, abt[0])), - "I" (PSR_F_BIT | PSR_I_BIT | UND_MODE), + PLC (PSR_F_BIT | PSR_I_BIT | UND_MODE), "I" (offsetof(struct stack, und[0])), - "I" (PSR_F_BIT | PSR_I_BIT | SVC_MODE) + PLC (PSR_F_BIT | PSR_I_BIT | SVC_MODE) : "r14"); } diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c index dd56e11f339a..39baf1128bfa 100644 --- a/arch/arm/kernel/unwind.c +++ b/arch/arm/kernel/unwind.c @@ -62,7 +62,11 @@ struct unwind_ctrl_block { }; enum regs { +#ifdef CONFIG_THUMB2_KERNEL + FP = 7, +#else FP = 11, +#endif SP = 13, LR = 14, PC = 15 -- cgit v1.2.3 From adca6dc23bc620ea95392659625200a252b97be3 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Jul 2009 12:32:59 +0100 Subject: Thumb-2: Add support for loadable modules Modules compiled to Thumb-2 have two additional relocations needing to be resolved at load time, R_ARM_THM_CALL and R_ARM_THM_JUMP24, for BL and B.W instructions. The maximum Thumb-2 addressing range is +/-2^24 (+/-16MB) therefore the MODULES_VADDR macro in asm/memory.h is set to (MODULES_END - 8MB) for the Thumb-2 compiled kernel. Signed-off-by: Catalin Marinas --- arch/arm/include/asm/elf.h | 3 +++ arch/arm/include/asm/memory.h | 6 +++++ arch/arm/kernel/module.c | 53 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 62 insertions(+) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/elf.h b/arch/arm/include/asm/elf.h index c207504de84d..c3b911ee9151 100644 --- a/arch/arm/include/asm/elf.h +++ b/arch/arm/include/asm/elf.h @@ -55,6 +55,9 @@ typedef struct user_fp elf_fpregset_t; #define R_ARM_MOVW_ABS_NC 43 #define R_ARM_MOVT_ABS 44 +#define R_ARM_THM_CALL 10 +#define R_ARM_THM_JUMP24 30 + /* * These are used to set parameters in the core dumps. */ diff --git a/arch/arm/include/asm/memory.h b/arch/arm/include/asm/memory.h index 85763db87449..376be1a62866 100644 --- a/arch/arm/include/asm/memory.h +++ b/arch/arm/include/asm/memory.h @@ -44,7 +44,13 @@ * The module space lives between the addresses given by TASK_SIZE * and PAGE_OFFSET - it must be within 32MB of the kernel text. */ +#ifndef CONFIG_THUMB2_KERNEL #define MODULES_VADDR (PAGE_OFFSET - 16*1024*1024) +#else +/* smaller range for Thumb-2 symbols relocation (2^24)*/ +#define MODULES_VADDR (PAGE_OFFSET - 8*1024*1024) +#endif + #if TASK_SIZE > MODULES_VADDR #error Top of user space clashes with start of module space #endif diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index bac03c81489d..f28c5e9c51ea 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -102,6 +102,7 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, unsigned long loc; Elf32_Sym *sym; s32 offset; + u32 upper, lower, sign, j1, j2; offset = ELF32_R_SYM(rel->r_info); if (offset < 0 || offset > (symsec->sh_size / sizeof(Elf32_Sym))) { @@ -184,6 +185,58 @@ apply_relocate(Elf32_Shdr *sechdrs, const char *strtab, unsigned int symindex, (offset & 0x0fff); break; + case R_ARM_THM_CALL: + case R_ARM_THM_JUMP24: + upper = *(u16 *)loc; + lower = *(u16 *)(loc + 2); + + /* + * 25 bit signed address range (Thumb-2 BL and B.W + * instructions): + * S:I1:I2:imm10:imm11:0 + * where: + * S = upper[10] = offset[24] + * I1 = ~(J1 ^ S) = offset[23] + * I2 = ~(J2 ^ S) = offset[22] + * imm10 = upper[9:0] = offset[21:12] + * imm11 = lower[10:0] = offset[11:1] + * J1 = lower[13] + * J2 = lower[11] + */ + sign = (upper >> 10) & 1; + j1 = (lower >> 13) & 1; + j2 = (lower >> 11) & 1; + offset = (sign << 24) | ((~(j1 ^ sign) & 1) << 23) | + ((~(j2 ^ sign) & 1) << 22) | + ((upper & 0x03ff) << 12) | + ((lower & 0x07ff) << 1); + if (offset & 0x01000000) + offset -= 0x02000000; + offset += sym->st_value - loc; + + /* only Thumb addresses allowed (no interworking) */ + if (!(offset & 1) || + offset <= (s32)0xff000000 || + offset >= (s32)0x01000000) { + printk(KERN_ERR + "%s: relocation out of range, section " + "%d reloc %d sym '%s'\n", module->name, + relindex, i, strtab + sym->st_name); + return -ENOEXEC; + } + + sign = (offset >> 24) & 1; + j1 = sign ^ (~(offset >> 23) & 1); + j2 = sign ^ (~(offset >> 22) & 1); + *(u16 *)loc = (u16)((upper & 0xf800) | (sign << 10) | + ((offset >> 12) & 0x03ff)); + *(u16 *)(loc + 2) = (u16)((lower & 0xd000) | + (j1 << 13) | (j2 << 11) | + ((offset >> 1) & 0x07ff)); + upper = *(u16 *)loc; + lower = *(u16 *)(loc + 2); + break; + default: printk(KERN_ERR "%s: unknown relocation: %u\n", module->name, ELF32_R_TYPE(rel->r_info)); -- cgit v1.2.3 From 68b7f7153fa58df710924fbb79722717d2d16094 Mon Sep 17 00:00:00 2001 From: Paul Brook Date: Fri, 24 Jul 2009 12:34:58 +0100 Subject: nommu: ptrace support The patch below adds ARM ptrace functions to get the process load address. This is required for useful userspace debugging on mmuless systems. These values are obtained by reading magic offsets with PTRACE_PEEKUSR, as on other nommu targets. I picked arbitrary large values for the offsets. Signed-off-by: Paul Brook --- arch/arm/include/asm/ptrace.h | 8 ++++++++ arch/arm/kernel/ptrace.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/ptrace.h b/arch/arm/include/asm/ptrace.h index 67b833c9b6b9..bbecccda76d0 100644 --- a/arch/arm/include/asm/ptrace.h +++ b/arch/arm/include/asm/ptrace.h @@ -82,6 +82,14 @@ #define PSR_ENDSTATE 0 #endif +/* + * These are 'magic' values for PTRACE_PEEKUSR that return info about where a + * process is located in memory. + */ +#define PT_TEXT_ADDR 0x10000 +#define PT_DATA_ADDR 0x10004 +#define PT_TEXT_END_ADDR 0x10008 + #ifndef __ASSEMBLY__ /* diff --git a/arch/arm/kernel/ptrace.c b/arch/arm/kernel/ptrace.c index 89882a1d0187..a2ea3854cb3c 100644 --- a/arch/arm/kernel/ptrace.c +++ b/arch/arm/kernel/ptrace.c @@ -521,7 +521,13 @@ static int ptrace_read_user(struct task_struct *tsk, unsigned long off, return -EIO; tmp = 0; - if (off < sizeof(struct pt_regs)) + if (off == PT_TEXT_ADDR) + tmp = tsk->mm->start_code; + else if (off == PT_DATA_ADDR) + tmp = tsk->mm->start_data; + else if (off == PT_TEXT_END_ADDR) + tmp = tsk->mm->end_code; + else if (off < sizeof(struct pt_regs)) tmp = get_user_reg(tsk, off >> 2); return put_user(tmp, ret); -- cgit v1.2.3 From 05efde9d04ccc1d66a9d2225527c6ee638baa385 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 24 Jul 2009 12:34:59 +0100 Subject: nommu: Do not override the CP15 control reg value returned from initfunc The patch removes the "mrc" instruction in head-nommu.S overriding the r0 register containing the value to be written in the CP15 system control register. Signed-off-by: Catalin Marinas --- arch/arm/kernel/head-nommu.S | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/head-nommu.S b/arch/arm/kernel/head-nommu.S index b16393d2b71e..e5dfc2895e24 100644 --- a/arch/arm/kernel/head-nommu.S +++ b/arch/arm/kernel/head-nommu.S @@ -61,7 +61,10 @@ ENDPROC(stext) */ __after_proc_init: #ifdef CONFIG_CPU_CP15 - mrc p15, 0, r0, c1, c0, 0 @ read control reg + /* + * CP15 system control register value returned in r0 from + * the CPU init function. + */ #ifdef CONFIG_ALIGNMENT_TRAP orr r0, r0, #CR_A #else -- cgit v1.2.3 From 181f817eaaca4c1f8a9c265d339d2b96de8b245d Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 13 Aug 2009 20:38:16 +0200 Subject: [ARM] support tracing when using newer compilers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since gcc 4.4 the name and calling convention for function profiling on ARM changed. With this patch both types are supported. See http://sourceware.org/ml/libc-ports/2008-04/msg00009.html for some details. Lightly-Tested-by: Anand Gadiyar Tested-by: Kevin Hilman Signed-off-by: Uwe Kleine-König --- arch/arm/include/asm/ftrace.h | 1 + arch/arm/kernel/armksyms.c | 1 + arch/arm/kernel/entry-common.S | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/ftrace.h b/arch/arm/include/asm/ftrace.h index 39c8bc1a006a..0d4c478e01b6 100644 --- a/arch/arm/include/asm/ftrace.h +++ b/arch/arm/include/asm/ftrace.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ extern void mcount(void); +extern void __gnu_mcount_nc(void); #endif #endif diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c index 531e1860e546..0e627705f746 100644 --- a/arch/arm/kernel/armksyms.c +++ b/arch/arm/kernel/armksyms.c @@ -186,4 +186,5 @@ EXPORT_SYMBOL(_find_next_bit_be); #ifdef CONFIG_FUNCTION_TRACER EXPORT_SYMBOL(mcount); +EXPORT_SYMBOL(__gnu_mcount_nc); #endif diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 366e5097a41a..99208728d48f 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -132,6 +132,25 @@ ftrace_call: #else +ENTRY(__gnu_mcount_nc) + stmdb sp!, {r0-r3, lr} + ldr r0, =ftrace_trace_function + ldr r2, [r0] + adr r0, ftrace_stub + cmp r0, r2 + bne gnu_trace + ldmia sp!, {r0-r3, ip, lr} + bx ip + +gnu_trace: + ldr r1, [sp, #20] @ lr of instrumented routine + mov r0, lr + sub r0, r0, #MCOUNT_INSN_SIZE + mov lr, pc + mov pc, r2 + ldmia sp!, {r0-r3, ip, lr} + bx ip + ENTRY(mcount) stmdb sp!, {r0-r3, lr} ldr r0, =ftrace_trace_function -- cgit v1.2.3 From 0d928b0b616d1c5c5fe76019a87cba171ca91633 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Thu, 13 Aug 2009 20:38:17 +0200 Subject: Complete irq tracing support for ARM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this patch enabling and disabling irqs in assembler code and by the hardware wasn't tracked completly. I had to transpose two instructions in arch/arm/lib/bitops.h because restore_irqs doesn't preserve the flags with CONFIG_TRACE_IRQFLAGS=y Signed-off-by: Uwe Kleine-König Cc: Russell King Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Uwe Kleine-König --- arch/arm/include/asm/assembler.h | 49 ++++++++++++++++++++++++++++++++++++---- arch/arm/kernel/entry-armv.S | 10 ++++---- arch/arm/lib/bitops.h | 2 +- 3 files changed, 49 insertions(+), 12 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h index 15f8a092b700..44912cd5da13 100644 --- a/arch/arm/include/asm/assembler.h +++ b/arch/arm/include/asm/assembler.h @@ -74,23 +74,56 @@ * Enable and disable interrupts */ #if __LINUX_ARM_ARCH__ >= 6 - .macro disable_irq + .macro disable_irq_notrace cpsid i .endm - .macro enable_irq + .macro enable_irq_notrace cpsie i .endm #else - .macro disable_irq + .macro disable_irq_notrace msr cpsr_c, #PSR_I_BIT | SVC_MODE .endm - .macro enable_irq + .macro enable_irq_notrace msr cpsr_c, #SVC_MODE .endm #endif + .macro asm_trace_hardirqs_off +#if defined(CONFIG_TRACE_IRQFLAGS) + stmdb sp!, {r0-r3, ip, lr} + bl trace_hardirqs_off + ldmia sp!, {r0-r3, ip, lr} +#endif + .endm + + .macro asm_trace_hardirqs_on_cond, cond +#if defined(CONFIG_TRACE_IRQFLAGS) + /* + * actually the registers should be pushed and pop'd conditionally, but + * after bl the flags are certainly clobbered + */ + stmdb sp!, {r0-r3, ip, lr} + bl\cond trace_hardirqs_on + ldmia sp!, {r0-r3, ip, lr} +#endif + .endm + + .macro asm_trace_hardirqs_on + asm_trace_hardirqs_on_cond al + .endm + + .macro disable_irq + disable_irq_notrace + asm_trace_hardirqs_off + .endm + + .macro enable_irq + asm_trace_hardirqs_on + enable_irq_notrace + .endm /* * Save the current IRQ state and disable IRQs. Note that this macro * assumes FIQs are enabled, and that the processor is in SVC mode. @@ -104,10 +137,16 @@ * Restore interrupt state previously stored in a register. We don't * guarantee that this will preserve the flags. */ - .macro restore_irqs, oldcpsr + .macro restore_irqs_notrace, oldcpsr msr cpsr_c, \oldcpsr .endm + .macro restore_irqs, oldcpsr + tst \oldcpsr, #PSR_I_BIT + asm_trace_hardirqs_on_cond eq + restore_irqs_notrace \oldcpsr + .endm + #define USER(x...) \ 9999: x; \ .section __ex_table,"a"; \ diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index fc8af43c5000..792abd0dfae1 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -151,6 +151,8 @@ ENDPROC(__und_invalid) @ r4 - orig_r0 (see pt_regs definition in ptrace.h) @ stmia r5, {r0 - r4} + + asm_trace_hardirqs_off .endm .align 5 @@ -206,9 +208,6 @@ ENDPROC(__dabt_svc) __irq_svc: svc_entry -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif #ifdef CONFIG_PREEMPT get_thread_info tsk ldr r8, [tsk, #TI_PREEMPT] @ get preempt count @@ -383,6 +382,8 @@ ENDPROC(__pabt_svc) @ Clear FP to mark the first stack frame @ zero_fp + + asm_trace_hardirqs_off .endm .macro kuser_cmpxchg_check @@ -437,9 +438,6 @@ __irq_usr: usr_entry kuser_cmpxchg_check -#ifdef CONFIG_TRACE_IRQFLAGS - bl trace_hardirqs_off -#endif get_thread_info tsk #ifdef CONFIG_PREEMPT ldr r8, [tsk, #TI_PREEMPT] @ get preempt count diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h index c7f2627385e7..d42252918bfb 100644 --- a/arch/arm/lib/bitops.h +++ b/arch/arm/lib/bitops.h @@ -60,8 +60,8 @@ tst r2, r0, lsl r3 \instr r2, r2, r0, lsl r3 \store r2, [r1] - restore_irqs ip moveq r0, #0 + restore_irqs ip mov pc, lr .endm #endif -- cgit v1.2.3 From 369842658a36bcea28ecb643ba4bdb53919330dd Mon Sep 17 00:00:00 2001 From: Mikael Pettersson Date: Sat, 15 Aug 2009 12:58:11 +0100 Subject: ARM: 5677/1: ARM support for TIF_RESTORE_SIGMASK/pselect6/ppoll/epoll_pwait This patch adds support for TIF_RESTORE_SIGMASK to ARM's signal handling, which allows to hook up the pselect6, ppoll, and epoll_pwait syscalls on ARM. Tested here with eabi userspace and a test program with a deliberate race between a child's exit and the parent's sigprocmask/select sequence. Using sys_pselect6() instead of sigprocmask/select reliably prevents the race. The other arch's support for TIF_RESTORE_SIGMASK has evolved over time: In 2.6.16: - add TIF_RESTORE_SIGMASK which parallels TIF_SIGPENDING - test both when checking for pending signal [changed later] - reimplement sys_sigsuspend() to use current->saved_sigmask, TIF_RESTORE_SIGMASK [changed later], and -ERESTARTNOHAND; ditto for sys_rt_sigsuspend(), but drop private code and use common code via __ARCH_WANT_SYS_RT_SIGSUSPEND; - there are now no "extra" calls to do_signal() so its oldset parameter is always ¤t->blocked so need not be passed, also its return value is changed to void - change handle_signal() to return 0/-errno - change do_signal() to honor TIF_RESTORE_SIGMASK: + get oldset from current->saved_sigmask if TIF_RESTORE_SIGMASK is set + if handle_signal() was successful then clear TIF_RESTORE_SIGMASK + if no signal was delivered and TIF_RESTORE_SIGMASK is set then clear it and restore the sigmask - hook up sys_pselect6() and sys_ppoll() In 2.6.19: - hook up sys_epoll_pwait() In 2.6.26: - allow archs to override how TIF_RESTORE_SIGMASK is implemented; default set_restore_sigmask() sets both TIF_RESTORE_SIGMASK and TIF_SIGPENDING; archs need now just test TIF_SIGPENDING again when checking for pending signal work; some archs now implement TIF_RESTORE_SIGMASK as a secondary/non-atomic thread flag bit - call set_restore_sigmask() in sys_sigsuspend() instead of setting TIF_RESTORE_SIGMASK In 2.6.29-rc: - kill sys_pselect7() which no arch wanted So for 2.6.31-rc6/ARM this patch does the following: - Add TIF_RESTORE_SIGMASK. Use the generic set_restore_sigmask() which sets both TIF_SIGPENDING and TIF_RESTORE_SIGMASK, so TIF_RESTORE_SIGMASK need not claim one of the scarce low thread flags, and existing TIF_SIGPENDING and _TIF_WORK_MASK tests need not be extended for TIF_RESTORE_SIGMASK. - sys_sigsuspend() is reimplemented to use current->saved_sigmask and set_restore_sigmask(), making it identical to most other archs - The private code for sys_rt_sigsuspend() is removed, instead generic code supplies it via __ARCH_WANT_SYS_RT_SIGSUSPEND. - sys_sigsuspend() and sys_rt_sigsuspend() no longer need a pt_regs parameter, so their assembly code wrappers are removed. - handle_signal() is changed to return 0 on success or -errno. - The oldset parameter to do_signal() is now redundant and removed, and the return value is now also redundant and changed to void. - do_signal() is changed to honor TIF_RESTORE_SIGMASK: + get oldset from current->saved_sigmask if TIF_RESTORE_SIGMASK is set + if handle_signal() was successful then clear TIF_RESTORE_SIGMASK + if no signal was delivered and TIF_RESTORE_SIGMASK is set then clear it and restore the sigmask - Hook up sys_pselect6, sys_ppoll, and sys_epoll_pwait. Signed-off-by: Mikael Pettersson Signed-off-by: Russell King --- arch/arm/include/asm/thread_info.h | 2 + arch/arm/include/asm/unistd.h | 7 ++-- arch/arm/kernel/calls.S | 10 ++--- arch/arm/kernel/entry-common.S | 10 ----- arch/arm/kernel/signal.c | 86 ++++++++++++++++---------------------- 5 files changed, 48 insertions(+), 67 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 73394e50cbca..e20d80539b42 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -140,6 +140,7 @@ extern void vfp_sync_state(struct thread_info *thread); #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 #define TIF_FREEZE 19 +#define TIF_RESTORE_SIGMASK 20 #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) @@ -147,6 +148,7 @@ extern void vfp_sync_state(struct thread_info *thread); #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_USING_IWMMXT (1 << TIF_USING_IWMMXT) #define _TIF_FREEZE (1 << TIF_FREEZE) +#define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) /* * Change these and you break ASM code in entry-common.S diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 0e97b8cb77d5..9122c9ee18fb 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -360,8 +360,8 @@ #define __NR_readlinkat (__NR_SYSCALL_BASE+332) #define __NR_fchmodat (__NR_SYSCALL_BASE+333) #define __NR_faccessat (__NR_SYSCALL_BASE+334) - /* 335 for pselect6 */ - /* 336 for ppoll */ +#define __NR_pselect6 (__NR_SYSCALL_BASE+335) +#define __NR_ppoll (__NR_SYSCALL_BASE+336) #define __NR_unshare (__NR_SYSCALL_BASE+337) #define __NR_set_robust_list (__NR_SYSCALL_BASE+338) #define __NR_get_robust_list (__NR_SYSCALL_BASE+339) @@ -372,7 +372,7 @@ #define __NR_vmsplice (__NR_SYSCALL_BASE+343) #define __NR_move_pages (__NR_SYSCALL_BASE+344) #define __NR_getcpu (__NR_SYSCALL_BASE+345) - /* 346 for epoll_pwait */ +#define __NR_epoll_pwait (__NR_SYSCALL_BASE+346) #define __NR_kexec_load (__NR_SYSCALL_BASE+347) #define __NR_utimensat (__NR_SYSCALL_BASE+348) #define __NR_signalfd (__NR_SYSCALL_BASE+349) @@ -432,6 +432,7 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGACTION +#define __ARCH_WANT_SYS_RT_SIGSUSPEND #if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT) #define __ARCH_WANT_SYS_TIME diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index f776e72a4cb8..ecfa98954d1d 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -81,7 +81,7 @@ CALL(sys_ni_syscall) /* was sys_ssetmask */ /* 70 */ CALL(sys_setreuid16) CALL(sys_setregid16) - CALL(sys_sigsuspend_wrapper) + CALL(sys_sigsuspend) CALL(sys_sigpending) CALL(sys_sethostname) /* 75 */ CALL(sys_setrlimit) @@ -188,7 +188,7 @@ CALL(sys_rt_sigpending) CALL(sys_rt_sigtimedwait) CALL(sys_rt_sigqueueinfo) - CALL(sys_rt_sigsuspend_wrapper) + CALL(sys_rt_sigsuspend) /* 180 */ CALL(ABI(sys_pread64, sys_oabi_pread64)) CALL(ABI(sys_pwrite64, sys_oabi_pwrite64)) CALL(sys_chown16) @@ -344,8 +344,8 @@ CALL(sys_readlinkat) CALL(sys_fchmodat) CALL(sys_faccessat) -/* 335 */ CALL(sys_ni_syscall) /* eventually pselect6 */ - CALL(sys_ni_syscall) /* eventually ppoll */ +/* 335 */ CALL(sys_pselect6) + CALL(sys_ppoll) CALL(sys_unshare) CALL(sys_set_robust_list) CALL(sys_get_robust_list) @@ -355,7 +355,7 @@ CALL(sys_vmsplice) CALL(sys_move_pages) /* 345 */ CALL(sys_getcpu) - CALL(sys_ni_syscall) /* eventually epoll_pwait */ + CALL(sys_epoll_pwait) CALL(sys_kexec_load) CALL(sys_utimensat) CALL(sys_signalfd) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 366e5097a41a..bfa7f0af7ede 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -373,16 +373,6 @@ sys_clone_wrapper: b sys_clone ENDPROC(sys_clone_wrapper) -sys_sigsuspend_wrapper: - add r3, sp, #S_OFF - b sys_sigsuspend -ENDPROC(sys_sigsuspend_wrapper) - -sys_rt_sigsuspend_wrapper: - add r2, sp, #S_OFF - b sys_rt_sigsuspend -ENDPROC(sys_rt_sigsuspend_wrapper) - sys_sigreturn_wrapper: add r0, sp, #S_OFF b sys_sigreturn diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 93bb4247b7ed..e27ee1f701d5 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -47,57 +47,22 @@ const unsigned long sigreturn_codes[7] = { MOV_R7_NR_RT_SIGRETURN, SWI_SYS_RT_SIGRETURN, SWI_THUMB_RT_SIGRETURN, }; -static int do_signal(sigset_t *oldset, struct pt_regs * regs, int syscall); - /* * atomically swap in the new signal mask, and wait for a signal. */ -asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, old_sigset_t mask, struct pt_regs *regs) +asmlinkage int sys_sigsuspend(int restart, unsigned long oldmask, old_sigset_t mask) { - sigset_t saveset; - mask &= _BLOCKABLE; spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; + current->saved_sigmask = current->blocked; siginitset(¤t->blocked, mask); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - regs->ARM_r0 = -EINTR; - - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(&saveset, regs, 0)) - return regs->ARM_r0; - } -} - -asmlinkage int -sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize, struct pt_regs *regs) -{ - sigset_t saveset, newset; - - /* XXX: Don't preclude handling different sized sigset_t's. */ - if (sigsetsize != sizeof(sigset_t)) - return -EINVAL; - - if (copy_from_user(&newset, unewset, sizeof(newset))) - return -EFAULT; - sigdelsetmask(&newset, ~_BLOCKABLE); - - spin_lock_irq(¤t->sighand->siglock); - saveset = current->blocked; - current->blocked = newset; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - regs->ARM_r0 = -EINTR; - while (1) { - current->state = TASK_INTERRUPTIBLE; - schedule(); - if (do_signal(&saveset, regs, 0)) - return regs->ARM_r0; - } + current->state = TASK_INTERRUPTIBLE; + schedule(); + set_restore_sigmask(); + return -ERESTARTNOHAND; } asmlinkage int @@ -545,7 +510,7 @@ static inline void setup_syscall_restart(struct pt_regs *regs) /* * OK, we're invoking a handler */ -static void +static int handle_signal(unsigned long sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *oldset, struct pt_regs * regs, int syscall) @@ -596,7 +561,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, if (ret != 0) { force_sigsegv(sig, tsk); - return; + return ret; } /* @@ -610,6 +575,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, recalc_sigpending(); spin_unlock_irq(&tsk->sighand->siglock); + return 0; } /* @@ -621,7 +587,7 @@ handle_signal(unsigned long sig, struct k_sigaction *ka, * the kernel can handle, and then we build all the user-level signal handling * stack-frames in one go after that. */ -static int do_signal(sigset_t *oldset, struct pt_regs *regs, int syscall) +static void do_signal(struct pt_regs *regs, int syscall) { struct k_sigaction ka; siginfo_t info; @@ -634,7 +600,7 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs, int syscall) * if so. */ if (!user_mode(regs)) - return 0; + return; if (try_to_freeze()) goto no_signal; @@ -643,9 +609,24 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs, int syscall) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - handle_signal(signr, &ka, &info, oldset, regs, syscall); + sigset_t *oldset; + + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + oldset = ¤t->saved_sigmask; + else + oldset = ¤t->blocked; + if (handle_signal(signr, &ka, &info, oldset, regs, syscall) == 0) { + /* + * A signal was successfully delivered; the saved + * sigmask will have been stored in the signal frame, + * and will be restored by sigreturn, so we can simply + * clear the TIF_RESTORE_SIGMASK flag. + */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) + clear_thread_flag(TIF_RESTORE_SIGMASK); + } single_step_set(current); - return 1; + return; } no_signal: @@ -697,14 +678,21 @@ static int do_signal(sigset_t *oldset, struct pt_regs *regs, int syscall) regs->ARM_r0 == -ERESTARTNOINTR) { setup_syscall_restart(regs); } + + /* If there's no signal to deliver, we just put the saved sigmask + * back. + */ + if (test_thread_flag(TIF_RESTORE_SIGMASK)) { + clear_thread_flag(TIF_RESTORE_SIGMASK); + sigprocmask(SIG_SETMASK, ¤t->saved_sigmask, NULL); + } } single_step_set(current); - return 0; } asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall) { if (thread_flags & _TIF_SIGPENDING) - do_signal(¤t->blocked, regs, syscall); + do_signal(regs, syscall); } -- cgit v1.2.3 From baa28e3530375e0bef2c53243634a1c78f5c02f3 Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Mon, 3 Aug 2009 15:11:29 +0100 Subject: ARM: Show FIQ in /proc/interrupts on CONFIG_FIQ The show_fiq_list() call in arch/arm/kernel/irq.c currently depends on CONFIG_ARCH_ACORN, but this is not the only architecture that supports the usage of FIQ. Change to calling this if CONFIG_FIQ is set (which is what arch/arm/kernel/fiq.c is built by). Signed-off-by: Ben Dooks Signed-off-by: Ben Dooks --- arch/arm/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c index b7c3490eaa24..c9a8619f3856 100644 --- a/arch/arm/kernel/irq.c +++ b/arch/arm/kernel/irq.c @@ -86,7 +86,7 @@ int show_interrupts(struct seq_file *p, void *v) unlock: spin_unlock_irqrestore(&irq_desc[i].lock, flags); } else if (i == NR_IRQS) { -#ifdef CONFIG_ARCH_ACORN +#ifdef CONFIG_FIQ show_fiq_list(p, v); #endif #ifdef CONFIG_SMP -- cgit v1.2.3 From d0420c83f39f79afb82010c2d2cafd150eef651b Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Sep 2009 09:14:16 +0100 Subject: KEYS: Extend TIF_NOTIFY_RESUME to (almost) all architectures [try #6] Implement TIF_NOTIFY_RESUME for most of those architectures in which isn't yet available, and, whilst we're at it, have it call the appropriate tracehook. After this patch, blackfin, m68k* and xtensa still lack support and need alteration of assembly code to make it work. Resume notification can then be used (by a later patch) to install a new session keyring on the parent of a process. Signed-off-by: David Howells Acked-by: Russell King cc: linux-arch@vger.kernel.org Signed-off-by: James Morris --- arch/alpha/include/asm/thread_info.h | 5 ++++- arch/alpha/kernel/signal.c | 5 +++++ arch/arm/include/asm/thread_info.h | 3 +++ arch/arm/kernel/entry-common.S | 2 +- arch/arm/kernel/signal.c | 5 +++++ arch/avr32/include/asm/thread_info.h | 6 +++++- arch/avr32/kernel/entry-avr32b.S | 2 +- arch/avr32/kernel/signal.c | 5 +++++ arch/cris/kernel/ptrace.c | 5 +++++ arch/h8300/include/asm/thread_info.h | 2 ++ arch/h8300/kernel/signal.c | 5 +++++ arch/m32r/include/asm/thread_info.h | 2 ++ arch/m32r/kernel/signal.c | 5 +++++ arch/mips/include/asm/thread_info.h | 2 ++ arch/mips/kernel/signal.c | 5 +++++ arch/parisc/include/asm/thread_info.h | 4 +++- arch/parisc/kernel/entry.S | 2 +- arch/parisc/kernel/signal.c | 5 +++++ 18 files changed, 64 insertions(+), 6 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 60c83abfde70..5076a8860b18 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -75,6 +75,7 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TIF_UAC_SIGBUS 7 #define TIF_MEMDIE 8 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal */ +#define TIF_NOTIFY_RESUME 10 /* callback before returning to user */ #define TIF_FREEZE 16 /* is freezing for suspend */ #define _TIF_SYSCALL_TRACE (1<blocked, regs, syscall); + + if (thread_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } } diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index fc42de5ca209..fd0c5d7e9337 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -84,6 +84,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_MEMDIE 6 #define TIF_RESTORE_SIGMASK 7 /* restore signal mask in do_signal */ #define TIF_CPU_GOING_TO_SLEEP 8 /* CPU is entering sleep 0 mode */ +#define TIF_NOTIFY_RESUME 9 /* callback before returning to user */ #define TIF_FREEZE 29 #define TIF_DEBUG 30 /* debugging enabled */ #define TIF_USERSPACE 31 /* true if FS sets userspace */ @@ -96,6 +97,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_MEMDIE (1 << TIF_MEMDIE) #define _TIF_RESTORE_SIGMASK (1 << TIF_RESTORE_SIGMASK) #define _TIF_CPU_GOING_TO_SLEEP (1 << TIF_CPU_GOING_TO_SLEEP) +#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_FREEZE (1 << TIF_FREEZE) /* Note: The masks below must never span more than 16 bits! */ @@ -103,13 +105,15 @@ static inline struct thread_info *current_thread_info(void) /* work to do on interrupt/exception return */ #define _TIF_WORK_MASK \ ((1 << TIF_SIGPENDING) \ + | _TIF_NOTIFY_RESUME \ | (1 << TIF_NEED_RESCHED) \ | (1 << TIF_POLLING_NRFLAG) \ | (1 << TIF_BREAKPOINT) \ | (1 << TIF_RESTORE_SIGMASK)) /* work to do on any return to userspace */ -#define _TIF_ALLWORK_MASK (_TIF_WORK_MASK | (1 << TIF_SYSCALL_TRACE)) +#define _TIF_ALLWORK_MASK (_TIF_WORK_MASK | (1 << TIF_SYSCALL_TRACE) | \ + _TIF_NOTIFY_RESUME) /* work to do on return from debug mode */ #define _TIF_DBGWORK_MASK (_TIF_WORK_MASK & ~(1 << TIF_BREAKPOINT)) diff --git a/arch/avr32/kernel/entry-avr32b.S b/arch/avr32/kernel/entry-avr32b.S index 009a80155d67..169268c40ae2 100644 --- a/arch/avr32/kernel/entry-avr32b.S +++ b/arch/avr32/kernel/entry-avr32b.S @@ -281,7 +281,7 @@ syscall_exit_work: ld.w r1, r0[TI_flags] rjmp 1b -2: mov r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK +2: mov r2, _TIF_SIGPENDING | _TIF_RESTORE_SIGMASK | _TIF_NOTIFY_RESUME tst r1, r2 breq 3f unmask_interrupts diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 27227561bad6..62d242e2d033 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -322,4 +322,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) if (ti->flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) do_signal(regs, ¤t->blocked, syscall); + + if (ti->flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } } diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index b326023baab2..4f06d7fb43dd 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -36,4 +36,9 @@ void do_notify_resume(int canrestart, struct pt_regs *regs, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(canrestart,regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } } diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index 8bbc8b0ee45d..70e67e47d020 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -89,6 +89,7 @@ static inline struct thread_info *current_thread_info(void) TIF_NEED_RESCHED */ #define TIF_MEMDIE 4 #define TIF_RESTORE_SIGMASK 5 /* restore signal mask in do_signal() */ +#define TIF_NOTIFY_RESUME 6 /* callback before returning to user */ #define TIF_FREEZE 16 /* is freezing for suspend */ /* as above, but as bit values */ @@ -97,6 +98,7 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_NEED_RESCHED (1<) %r19, %r20, %r0 b,n intr_restore /* skip past if we've nothing to do */ diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index f82544225e8e..b3bfc4326703 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -645,4 +645,9 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall) if (test_thread_flag(TIF_SIGPENDING) || test_thread_flag(TIF_RESTORE_SIGMASK)) do_signal(regs, in_syscall); + + if (test_thread_flag(TIF_NOTIFY_RESUME)) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } } -- cgit v1.2.3 From ee18d64c1f632043a02e6f5ba5e045bb26a5465f Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 2 Sep 2009 09:14:21 +0100 Subject: KEYS: Add a keyctl to install a process's session keyring on its parent [try #6] Add a keyctl to install a process's session keyring onto its parent. This replaces the parent's session keyring. Because the COW credential code does not permit one process to change another process's credentials directly, the change is deferred until userspace next starts executing again. Normally this will be after a wait*() syscall. To support this, three new security hooks have been provided: cred_alloc_blank() to allocate unset security creds, cred_transfer() to fill in the blank security creds and key_session_to_parent() - which asks the LSM if the process may replace its parent's session keyring. The replacement may only happen if the process has the same ownership details as its parent, and the process has LINK permission on the session keyring, and the session keyring is owned by the process, and the LSM permits it. Note that this requires alteration to each architecture's notify_resume path. This has been done for all arches barring blackfin, m68k* and xtensa, all of which need assembly alteration to support TIF_NOTIFY_RESUME. This allows the replacement to be performed at the point the parent process resumes userspace execution. This allows the userspace AFS pioctl emulation to fully emulate newpag() and the VIOCSETTOK and VIOCSETTOK2 pioctls, all of which require the ability to alter the parent process's PAG membership. However, since kAFS doesn't use PAGs per se, but rather dumps the keys into the session keyring, the session keyring of the parent must be replaced if, for example, VIOCSETTOK is passed the newpag flag. This can be tested with the following program: #include #include #include #define KEYCTL_SESSION_TO_PARENT 18 #define OSERROR(X, S) do { if ((long)(X) == -1) { perror(S); exit(1); } } while(0) int main(int argc, char **argv) { key_serial_t keyring, key; long ret; keyring = keyctl_join_session_keyring(argv[1]); OSERROR(keyring, "keyctl_join_session_keyring"); key = add_key("user", "a", "b", 1, keyring); OSERROR(key, "add_key"); ret = keyctl(KEYCTL_SESSION_TO_PARENT); OSERROR(ret, "KEYCTL_SESSION_TO_PARENT"); return 0; } Compiled and linked with -lkeyutils, you should see something like: [dhowells@andromeda ~]$ keyctl show Session Keyring -3 --alswrv 4043 4043 keyring: _ses 355907932 --alswrv 4043 -1 \_ keyring: _uid.4043 [dhowells@andromeda ~]$ /tmp/newpag [dhowells@andromeda ~]$ keyctl show Session Keyring -3 --alswrv 4043 4043 keyring: _ses 1055658746 --alswrv 4043 4043 \_ user: a [dhowells@andromeda ~]$ /tmp/newpag hello [dhowells@andromeda ~]$ keyctl show Session Keyring -3 --alswrv 4043 4043 keyring: hello 340417692 --alswrv 4043 4043 \_ user: a Where the test program creates a new session keyring, sticks a user key named 'a' into it and then installs it on its parent. Signed-off-by: David Howells Signed-off-by: James Morris --- Documentation/keys.txt | 20 +++++++++ arch/alpha/kernel/signal.c | 2 + arch/arm/kernel/signal.c | 2 + arch/avr32/kernel/signal.c | 2 + arch/cris/kernel/ptrace.c | 2 + arch/frv/kernel/signal.c | 2 + arch/h8300/kernel/signal.c | 2 + arch/ia64/kernel/process.c | 2 + arch/m32r/kernel/signal.c | 2 + arch/mips/kernel/signal.c | 2 + arch/mn10300/kernel/signal.c | 2 + arch/parisc/kernel/signal.c | 2 + arch/s390/kernel/signal.c | 2 + arch/sh/kernel/signal_32.c | 2 + arch/sh/kernel/signal_64.c | 2 + arch/sparc/kernel/signal_32.c | 2 + arch/sparc/kernel/signal_64.c | 3 ++ arch/x86/kernel/signal.c | 2 + include/linux/cred.h | 1 + include/linux/key.h | 3 ++ include/linux/keyctl.h | 1 + include/linux/sched.h | 1 + include/linux/security.h | 38 ++++++++++++++++ kernel/cred.c | 43 ++++++++++++++++++ security/capability.c | 19 ++++++++ security/keys/compat.c | 3 ++ security/keys/gc.c | 1 + security/keys/internal.h | 1 + security/keys/keyctl.c | 102 ++++++++++++++++++++++++++++++++++++++++++ security/keys/process_keys.c | 49 ++++++++++++++++++++ security/security.c | 17 +++++++ security/selinux/hooks.c | 28 ++++++++++++ security/smack/smack_lsm.c | 30 +++++++++++++ security/tomoyo/tomoyo.c | 17 +++++++ 34 files changed, 409 insertions(+) (limited to 'arch/arm/kernel') diff --git a/Documentation/keys.txt b/Documentation/keys.txt index 203487e9b1d8..e4dbbdb1bd96 100644 --- a/Documentation/keys.txt +++ b/Documentation/keys.txt @@ -757,6 +757,26 @@ The keyctl syscall functions are: successful. + (*) Install the calling process's session keyring on its parent. + + long keyctl(KEYCTL_SESSION_TO_PARENT); + + This functions attempts to install the calling process's session keyring + on to the calling process's parent, replacing the parent's current session + keyring. + + The calling process must have the same ownership as its parent, the + keyring must have the same ownership as the calling process, the calling + process must have LINK permission on the keyring and the active LSM module + mustn't deny permission, otherwise error EPERM will be returned. + + Error ENOMEM will be returned if there was insufficient memory to complete + the operation, otherwise 0 will be returned to indicate success. + + The keyring will be replaced next time the parent process leaves the + kernel and resumes executing userspace. + + =============== KERNEL SERVICES =============== diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index 04e17c1f0f1b..d91aaa747050 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -687,5 +687,7 @@ do_notify_resume(struct pt_regs *regs, struct switch_stack *sw, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 13dec276927a..ea4ad3a43c88 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -711,5 +711,7 @@ do_notify_resume(struct pt_regs *regs, unsigned int thread_flags, int syscall) if (thread_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index 62d242e2d033..de9f7fe2aefa 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -326,5 +326,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, struct thread_info *ti) if (ti->flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index 4f06d7fb43dd..32e9d5ee895b 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -40,5 +40,7 @@ void do_notify_resume(int canrestart, struct pt_regs *regs, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/frv/kernel/signal.c b/arch/frv/kernel/signal.c index 4a7a62c6e783..6b0a2b6fed6a 100644 --- a/arch/frv/kernel/signal.c +++ b/arch/frv/kernel/signal.c @@ -572,6 +572,8 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(__frame); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } /* end do_notify_resume() */ diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index 56b3ab7dbbb0..abac3ee8c52a 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -556,5 +556,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 5d7c0e5b9e76..89969e950045 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -192,6 +192,8 @@ do_notify_resume_user(sigset_t *unused, struct sigscratch *scr, long in_syscall) if (test_thread_flag(TIF_NOTIFY_RESUME)) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(&scr->pt); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } /* copy user rbs to kernel rbs */ diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index 3220258be188..f80bac17c65c 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -411,6 +411,8 @@ void do_notify_resume(struct pt_regs *regs, sigset_t *oldset, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } clear_thread_flag(TIF_IRET); diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index a3d1015471de..c2acf31874a4 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -704,5 +704,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, void *unused, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index feb2f2e810db..a21f43bc68e2 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -568,5 +568,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(__frame); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index b3bfc4326703..5ca1c02b805a 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -649,5 +649,7 @@ void do_notify_resume(struct pt_regs *regs, long in_syscall) if (test_thread_flag(TIF_NOTIFY_RESUME)) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index 062bd64e65fa..6b4fef877f9d 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -536,4 +536,6 @@ void do_notify_resume(struct pt_regs *regs) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } diff --git a/arch/sh/kernel/signal_32.c b/arch/sh/kernel/signal_32.c index b5afbec1db59..04a21883f327 100644 --- a/arch/sh/kernel/signal_32.c +++ b/arch/sh/kernel/signal_32.c @@ -640,5 +640,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned int save_r0, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/sh/kernel/signal_64.c b/arch/sh/kernel/signal_64.c index 0663a0ee6021..9e5c9b1d7e98 100644 --- a/arch/sh/kernel/signal_64.c +++ b/arch/sh/kernel/signal_64.c @@ -772,5 +772,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_info if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/sparc/kernel/signal_32.c b/arch/sparc/kernel/signal_32.c index 181d069a2d44..7ce1a1005b1d 100644 --- a/arch/sparc/kernel/signal_32.c +++ b/arch/sparc/kernel/signal_32.c @@ -590,6 +590,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index ec82d76dc6f2..647afbda7ae1 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -613,5 +613,8 @@ void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } } + diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4c578751e94e..81e58238c4ce 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -869,6 +869,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_NOTIFY_RESUME) { clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); + if (current->replacement_session_keyring) + key_replace_session_keyring(); } #ifdef CONFIG_X86_32 diff --git a/include/linux/cred.h b/include/linux/cred.h index 85439abdbc80..24520a539c6f 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -152,6 +152,7 @@ struct cred { extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); +extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern struct cred *prepare_usermodehelper_creds(void); diff --git a/include/linux/key.h b/include/linux/key.h index 33e0165de100..cd50dfa1d4c2 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -278,6 +278,8 @@ static inline key_serial_t key_serial(struct key *key) extern ctl_table key_sysctls[]; #endif +extern void key_replace_session_keyring(void); + /* * the userspace interface */ @@ -300,6 +302,7 @@ extern void key_init(void); #define key_fsuid_changed(t) do { } while(0) #define key_fsgid_changed(t) do { } while(0) #define key_init() do { } while(0) +#define key_replace_session_keyring() do { } while(0) #endif /* CONFIG_KEYS */ #endif /* __KERNEL__ */ diff --git a/include/linux/keyctl.h b/include/linux/keyctl.h index c0688eb72093..bd383f1944fb 100644 --- a/include/linux/keyctl.h +++ b/include/linux/keyctl.h @@ -52,5 +52,6 @@ #define KEYCTL_SET_TIMEOUT 15 /* set key timeout */ #define KEYCTL_ASSUME_AUTHORITY 16 /* assume request_key() authorisation */ #define KEYCTL_GET_SECURITY 17 /* get key security label */ +#define KEYCTL_SESSION_TO_PARENT 18 /* apply session keyring to parent process */ #endif /* _LINUX_KEYCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 5c7ce13c1696..9304027673b0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1292,6 +1292,7 @@ struct task_struct { struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) */ + struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock diff --git a/include/linux/security.h b/include/linux/security.h index 40ba39ea68ce..97de3fe3dd0d 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -653,6 +653,11 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. + * @cred_alloc_blank: + * @cred points to the credentials. + * @gfp indicates the atomicity of any memory allocations. + * Only allocate sufficient memory and attach to @cred such that + * cred_transfer() will not get ENOMEM. * @cred_free: * @cred points to the credentials. * Deallocate and clear the cred->security field in a set of credentials. @@ -665,6 +670,10 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * @new points to the new credentials. * @old points to the original credentials. * Install a new set of credentials. + * @cred_transfer: + * @new points to the new credentials. + * @old points to the original credentials. + * Transfer data from original creds to new creds * @kernel_act_as: * Set the credentials for a kernel service to act as (subjective context). * @new points to the credentials to be modified. @@ -1103,6 +1112,13 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * Return the length of the string (including terminating NUL) or -ve if * an error. * May also return 0 (and a NULL buffer pointer) if there is no label. + * @key_session_to_parent: + * Forcibly assign the session keyring from a process to its parent + * process. + * @cred: Pointer to process's credentials + * @parent_cred: Pointer to parent process's credentials + * @keyring: Proposed new session keyring + * Return 0 if permission is granted, -ve error otherwise. * * Security hooks affecting all System V IPC operations. * @@ -1498,10 +1514,12 @@ struct security_operations { int (*dentry_open) (struct file *file, const struct cred *cred); int (*task_create) (unsigned long clone_flags); + int (*cred_alloc_blank) (struct cred *cred, gfp_t gfp); void (*cred_free) (struct cred *cred); int (*cred_prepare)(struct cred *new, const struct cred *old, gfp_t gfp); void (*cred_commit)(struct cred *new, const struct cred *old); + void (*cred_transfer)(struct cred *new, const struct cred *old); int (*kernel_act_as)(struct cred *new, u32 secid); int (*kernel_create_files_as)(struct cred *new, struct inode *inode); int (*kernel_module_request)(void); @@ -1639,6 +1657,9 @@ struct security_operations { const struct cred *cred, key_perm_t perm); int (*key_getsecurity)(struct key *key, char **_buffer); + int (*key_session_to_parent)(const struct cred *cred, + const struct cred *parent_cred, + struct key *key); #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT @@ -1755,9 +1776,11 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_dentry_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); +int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); int security_prepare_creds(struct cred *new, const struct cred *old, gfp_t gfp); void security_commit_creds(struct cred *new, const struct cred *old); +void security_transfer_creds(struct cred *new, const struct cred *old); int security_kernel_act_as(struct cred *new, u32 secid); int security_kernel_create_files_as(struct cred *new, struct inode *inode); int security_kernel_module_request(void); @@ -2286,6 +2309,9 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } +static inline void security_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ } + static inline void security_cred_free(struct cred *cred) { } @@ -2301,6 +2327,11 @@ static inline void security_commit_creds(struct cred *new, { } +static inline void security_transfer_creds(struct cred *new, + const struct cred *old) +{ +} + static inline int security_kernel_act_as(struct cred *cred, u32 secid) { return 0; @@ -2923,6 +2954,9 @@ void security_key_free(struct key *key); int security_key_permission(key_ref_t key_ref, const struct cred *cred, key_perm_t perm); int security_key_getsecurity(struct key *key, char **_buffer); +int security_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key); #else @@ -2950,6 +2984,10 @@ static inline int security_key_getsecurity(struct key *key, char **_buffer) return 0; } +static inline int security_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key); + #endif #endif /* CONFIG_KEYS */ diff --git a/kernel/cred.c b/kernel/cred.c index 24dd2f5104b1..006fcab009d5 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -199,6 +199,49 @@ void exit_creds(struct task_struct *tsk) validate_creds(cred); alter_cred_subscribers(cred, -1); put_cred(cred); + + cred = (struct cred *) tsk->replacement_session_keyring; + if (cred) { + tsk->replacement_session_keyring = NULL; + validate_creds(cred); + put_cred(cred); + } +} + +/* + * Allocate blank credentials, such that the credentials can be filled in at a + * later date without risk of ENOMEM. + */ +struct cred *cred_alloc_blank(void) +{ + struct cred *new; + + new = kmem_cache_zalloc(cred_jar, GFP_KERNEL); + if (!new) + return NULL; + +#ifdef CONFIG_KEYS + new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); + if (!new->tgcred) { + kfree(new); + return NULL; + } + atomic_set(&new->tgcred->usage, 1); +#endif + + atomic_set(&new->usage, 1); + + if (security_cred_alloc_blank(new, GFP_KERNEL) < 0) + goto error; + +#ifdef CONFIG_DEBUG_CREDENTIALS + new->magic = CRED_MAGIC; +#endif + return new; + +error: + abort_creds(new); + return NULL; } /** diff --git a/security/capability.c b/security/capability.c index 06400cf07757..93a2ffe65905 100644 --- a/security/capability.c +++ b/security/capability.c @@ -373,6 +373,11 @@ static int cap_task_create(unsigned long clone_flags) return 0; } +static int cap_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + return 0; +} + static void cap_cred_free(struct cred *cred) { } @@ -386,6 +391,10 @@ static void cap_cred_commit(struct cred *new, const struct cred *old) { } +static void cap_cred_transfer(struct cred *new, const struct cred *old) +{ +} + static int cap_kernel_act_as(struct cred *new, u32 secid) { return 0; @@ -836,6 +845,13 @@ static int cap_key_getsecurity(struct key *key, char **_buffer) return 0; } +static int cap_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key) +{ + return 0; +} + #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT @@ -961,9 +977,11 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, file_receive); set_to_cap_if_null(ops, dentry_open); set_to_cap_if_null(ops, task_create); + set_to_cap_if_null(ops, cred_alloc_blank); set_to_cap_if_null(ops, cred_free); set_to_cap_if_null(ops, cred_prepare); set_to_cap_if_null(ops, cred_commit); + set_to_cap_if_null(ops, cred_transfer); set_to_cap_if_null(ops, kernel_act_as); set_to_cap_if_null(ops, kernel_create_files_as); set_to_cap_if_null(ops, kernel_module_request); @@ -1063,6 +1081,7 @@ void security_fixup_ops(struct security_operations *ops) set_to_cap_if_null(ops, key_free); set_to_cap_if_null(ops, key_permission); set_to_cap_if_null(ops, key_getsecurity); + set_to_cap_if_null(ops, key_session_to_parent); #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT set_to_cap_if_null(ops, audit_rule_init); diff --git a/security/keys/compat.c b/security/keys/compat.c index c766c68a63bc..792c0a611a6d 100644 --- a/security/keys/compat.c +++ b/security/keys/compat.c @@ -82,6 +82,9 @@ asmlinkage long compat_sys_keyctl(u32 option, case KEYCTL_GET_SECURITY: return keyctl_get_security(arg2, compat_ptr(arg3), arg4); + case KEYCTL_SESSION_TO_PARENT: + return keyctl_session_to_parent(); + default: return -EOPNOTSUPP; } diff --git a/security/keys/gc.c b/security/keys/gc.c index 44adc325e15c..1e616aef55fd 100644 --- a/security/keys/gc.c +++ b/security/keys/gc.c @@ -65,6 +65,7 @@ static void key_gc_timer_func(unsigned long data) * - return true if we altered the keyring */ static bool key_gc_keyring(struct key *keyring, time_t limit) + __releases(key_serial_lock) { struct keyring_list *klist; struct key *key; diff --git a/security/keys/internal.h b/security/keys/internal.h index fb830514c337..24ba0307b7ad 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -201,6 +201,7 @@ extern long keyctl_set_timeout(key_serial_t, unsigned); extern long keyctl_assume_authority(key_serial_t); extern long keyctl_get_security(key_serial_t keyid, char __user *buffer, size_t buflen); +extern long keyctl_session_to_parent(void); /* * debugging key validation diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 736d7800f97f..74c968524592 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -1228,6 +1228,105 @@ long keyctl_get_security(key_serial_t keyid, return ret; } +/* + * attempt to install the calling process's session keyring on the process's + * parent process + * - the keyring must exist and must grant us LINK permission + * - implements keyctl(KEYCTL_SESSION_TO_PARENT) + */ +long keyctl_session_to_parent(void) +{ + struct task_struct *me, *parent; + const struct cred *mycred, *pcred; + struct cred *cred, *oldcred; + key_ref_t keyring_r; + int ret; + + keyring_r = lookup_user_key(KEY_SPEC_SESSION_KEYRING, 0, KEY_LINK); + if (IS_ERR(keyring_r)) + return PTR_ERR(keyring_r); + + /* our parent is going to need a new cred struct, a new tgcred struct + * and new security data, so we allocate them here to prevent ENOMEM in + * our parent */ + ret = -ENOMEM; + cred = cred_alloc_blank(); + if (!cred) + goto error_keyring; + + cred->tgcred->session_keyring = key_ref_to_ptr(keyring_r); + keyring_r = NULL; + + me = current; + write_lock_irq(&tasklist_lock); + + parent = me->real_parent; + ret = -EPERM; + + /* the parent mustn't be init and mustn't be a kernel thread */ + if (parent->pid <= 1 || !parent->mm) + goto not_permitted; + + /* the parent must be single threaded */ + if (atomic_read(&parent->signal->count) != 1) + goto not_permitted; + + /* the parent and the child must have different session keyrings or + * there's no point */ + mycred = current_cred(); + pcred = __task_cred(parent); + if (mycred == pcred || + mycred->tgcred->session_keyring == pcred->tgcred->session_keyring) + goto already_same; + + /* the parent must have the same effective ownership and mustn't be + * SUID/SGID */ + if (pcred-> uid != mycred->euid || + pcred->euid != mycred->euid || + pcred->suid != mycred->euid || + pcred-> gid != mycred->egid || + pcred->egid != mycred->egid || + pcred->sgid != mycred->egid) + goto not_permitted; + + /* the keyrings must have the same UID */ + if (pcred ->tgcred->session_keyring->uid != mycred->euid || + mycred->tgcred->session_keyring->uid != mycred->euid) + goto not_permitted; + + /* the LSM must permit the replacement of the parent's keyring with the + * keyring from this process */ + ret = security_key_session_to_parent(mycred, pcred, + key_ref_to_ptr(keyring_r)); + if (ret < 0) + goto not_permitted; + + /* if there's an already pending keyring replacement, then we replace + * that */ + oldcred = parent->replacement_session_keyring; + + /* the replacement session keyring is applied just prior to userspace + * restarting */ + parent->replacement_session_keyring = cred; + cred = NULL; + set_ti_thread_flag(task_thread_info(parent), TIF_NOTIFY_RESUME); + + write_unlock_irq(&tasklist_lock); + if (oldcred) + put_cred(oldcred); + return 0; + +already_same: + ret = 0; +not_permitted: + put_cred(cred); + return ret; + +error_keyring: + key_ref_put(keyring_r); + return ret; +} + /*****************************************************************************/ /* * the key control system call @@ -1313,6 +1412,9 @@ SYSCALL_DEFINE5(keyctl, int, option, unsigned long, arg2, unsigned long, arg3, (char __user *) arg3, (size_t) arg4); + case KEYCTL_SESSION_TO_PARENT: + return keyctl_session_to_parent(); + default: return -EOPNOTSUPP; } diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 4739cfbb41b7..5c23afb31ece 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -768,3 +769,51 @@ error: abort_creds(new); return ret; } + +/* + * Replace a process's session keyring when that process resumes userspace on + * behalf of one of its children + */ +void key_replace_session_keyring(void) +{ + const struct cred *old; + struct cred *new; + + if (!current->replacement_session_keyring) + return; + + write_lock_irq(&tasklist_lock); + new = current->replacement_session_keyring; + current->replacement_session_keyring = NULL; + write_unlock_irq(&tasklist_lock); + + if (!new) + return; + + old = current_cred(); + new-> uid = old-> uid; + new-> euid = old-> euid; + new-> suid = old-> suid; + new->fsuid = old->fsuid; + new-> gid = old-> gid; + new-> egid = old-> egid; + new-> sgid = old-> sgid; + new->fsgid = old->fsgid; + new->user = get_uid(old->user); + new->group_info = get_group_info(old->group_info); + + new->securebits = old->securebits; + new->cap_inheritable = old->cap_inheritable; + new->cap_permitted = old->cap_permitted; + new->cap_effective = old->cap_effective; + new->cap_bset = old->cap_bset; + + new->jit_keyring = old->jit_keyring; + new->thread_keyring = key_get(old->thread_keyring); + new->tgcred->tgid = old->tgcred->tgid; + new->tgcred->process_keyring = key_get(old->tgcred->process_keyring); + + security_transfer_creds(new, old); + + commit_creds(new); +} diff --git a/security/security.c b/security/security.c index f88eaf6b14cc..d8b727637f02 100644 --- a/security/security.c +++ b/security/security.c @@ -684,6 +684,11 @@ int security_task_create(unsigned long clone_flags) return security_ops->task_create(clone_flags); } +int security_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + return security_ops->cred_alloc_blank(cred, gfp); +} + void security_cred_free(struct cred *cred) { security_ops->cred_free(cred); @@ -699,6 +704,11 @@ void security_commit_creds(struct cred *new, const struct cred *old) security_ops->cred_commit(new, old); } +void security_transfer_creds(struct cred *new, const struct cred *old) +{ + security_ops->cred_transfer(new, old); +} + int security_kernel_act_as(struct cred *new, u32 secid) { return security_ops->kernel_act_as(new, secid); @@ -1241,6 +1251,13 @@ int security_key_getsecurity(struct key *key, char **_buffer) return security_ops->key_getsecurity(key, _buffer); } +int security_key_session_to_parent(const struct cred *cred, + const struct cred *parent_cred, + struct key *key) +{ + return security_ops->key_session_to_parent(cred, parent_cred, key); +} + #endif /* CONFIG_KEYS */ #ifdef CONFIG_AUDIT diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index c3bb31ecc5aa..134a9c0d2004 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -3232,6 +3232,21 @@ static int selinux_task_create(unsigned long clone_flags) return current_has_perm(current, PROCESS__FORK); } +/* + * allocate the SELinux part of blank credentials + */ +static int selinux_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + struct task_security_struct *tsec; + + tsec = kzalloc(sizeof(struct task_security_struct), gfp); + if (!tsec) + return -ENOMEM; + + cred->security = tsec; + return 0; +} + /* * detach and free the LSM part of a set of credentials */ @@ -3263,6 +3278,17 @@ static int selinux_cred_prepare(struct cred *new, const struct cred *old, return 0; } +/* + * transfer the SELinux data to a blank set of creds + */ +static void selinux_cred_transfer(struct cred *new, const struct cred *old) +{ + const struct task_security_struct *old_tsec = old->security; + struct task_security_struct *tsec = new->security; + + *tsec = *old_tsec; +} + /* * set the security data for a kernel service * - all the creation contexts are set to unlabelled @@ -5469,8 +5495,10 @@ static struct security_operations selinux_ops = { .dentry_open = selinux_dentry_open, .task_create = selinux_task_create, + .cred_alloc_blank = selinux_cred_alloc_blank, .cred_free = selinux_cred_free, .cred_prepare = selinux_cred_prepare, + .cred_transfer = selinux_cred_transfer, .kernel_act_as = selinux_kernel_act_as, .kernel_create_files_as = selinux_kernel_create_files_as, .kernel_module_request = selinux_kernel_module_request, diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index c243a2b25832..969f5fee1906 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1079,6 +1079,22 @@ static int smack_file_receive(struct file *file) * Task hooks */ +/** + * smack_cred_alloc_blank - "allocate" blank task-level security credentials + * @new: the new credentials + * @gfp: the atomicity of any memory allocations + * + * Prepare a blank set of credentials for modification. This must allocate all + * the memory the LSM module might require such that cred_transfer() can + * complete without error. + */ +static int smack_cred_alloc_blank(struct cred *cred, gfp_t gfp) +{ + cred->security = NULL; + return 0; +} + + /** * smack_cred_free - "free" task-level security credentials * @cred: the credentials in question @@ -1116,6 +1132,18 @@ static void smack_cred_commit(struct cred *new, const struct cred *old) { } +/** + * smack_cred_transfer - Transfer the old credentials to the new credentials + * @new: the new credentials + * @old: the original credentials + * + * Fill in a set of blank credentials from another set of credentials. + */ +static void smack_cred_transfer(struct cred *new, const struct cred *old) +{ + new->security = old->security; +} + /** * smack_kernel_act_as - Set the subjective context in a set of credentials * @new: points to the set of credentials to be modified. @@ -3073,9 +3101,11 @@ struct security_operations smack_ops = { .file_send_sigiotask = smack_file_send_sigiotask, .file_receive = smack_file_receive, + .cred_alloc_blank = smack_cred_alloc_blank, .cred_free = smack_cred_free, .cred_prepare = smack_cred_prepare, .cred_commit = smack_cred_commit, + .cred_transfer = smack_cred_transfer, .kernel_act_as = smack_kernel_act_as, .kernel_create_files_as = smack_kernel_create_files_as, .task_setpgid = smack_task_setpgid, diff --git a/security/tomoyo/tomoyo.c b/security/tomoyo/tomoyo.c index 35a13e7915e4..9548a0984cc4 100644 --- a/security/tomoyo/tomoyo.c +++ b/security/tomoyo/tomoyo.c @@ -14,6 +14,12 @@ #include "tomoyo.h" #include "realpath.h" +static int tomoyo_cred_alloc_blank(struct cred *new, gfp_t gfp) +{ + new->security = NULL; + return 0; +} + static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, gfp_t gfp) { @@ -25,6 +31,15 @@ static int tomoyo_cred_prepare(struct cred *new, const struct cred *old, return 0; } +static void tomoyo_cred_transfer(struct cred *new, const struct cred *old) +{ + /* + * Since "struct tomoyo_domain_info *" is a sharable pointer, + * we don't need to duplicate. + */ + new->security = old->security; +} + static int tomoyo_bprm_set_creds(struct linux_binprm *bprm) { int rc; @@ -262,7 +277,9 @@ static int tomoyo_dentry_open(struct file *f, const struct cred *cred) */ static struct security_operations tomoyo_security_ops = { .name = "tomoyo", + .cred_alloc_blank = tomoyo_cred_alloc_blank, .cred_prepare = tomoyo_cred_prepare, + .cred_transfer = tomoyo_cred_transfer, .bprm_set_creds = tomoyo_bprm_set_creds, .bprm_check_security = tomoyo_bprm_check_security, #ifdef CONFIG_SYSCTL -- cgit v1.2.3 From 733e5e4b4eb1bc1e27acbe092200154051171426 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 9 Sep 2009 08:30:21 +0100 Subject: KEYS: Add missing linux/tracehook.h #inclusions Add #inclusions of linux/tracehook.h to those arch files that had the tracehook call for TIF_NOTIFY_RESUME added when support for that flag was added to that arch. Signed-off-by: David Howells Signed-off-by: James Morris --- arch/alpha/kernel/signal.c | 1 + arch/arm/kernel/signal.c | 1 + arch/avr32/kernel/signal.c | 1 + arch/cris/kernel/ptrace.c | 1 + arch/h8300/kernel/signal.c | 1 + arch/m32r/kernel/signal.c | 1 + arch/mips/kernel/signal.c | 1 + arch/parisc/kernel/signal.c | 1 + 8 files changed, 8 insertions(+) (limited to 'arch/arm/kernel') diff --git a/arch/alpha/kernel/signal.c b/arch/alpha/kernel/signal.c index d91aaa747050..0932dbb1ef8e 100644 --- a/arch/alpha/kernel/signal.c +++ b/arch/alpha/kernel/signal.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index ea4ad3a43c88..b76fe06d92e7 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/arch/avr32/kernel/signal.c b/arch/avr32/kernel/signal.c index de9f7fe2aefa..64f886fac2ef 100644 --- a/arch/avr32/kernel/signal.c +++ b/arch/avr32/kernel/signal.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/arch/cris/kernel/ptrace.c b/arch/cris/kernel/ptrace.c index 32e9d5ee895b..48b0f3912632 100644 --- a/arch/cris/kernel/ptrace.c +++ b/arch/cris/kernel/ptrace.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index abac3ee8c52a..af842c369d24 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include diff --git a/arch/m32r/kernel/signal.c b/arch/m32r/kernel/signal.c index f80bac17c65c..144b0f124fc7 100644 --- a/arch/m32r/kernel/signal.c +++ b/arch/m32r/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mips/kernel/signal.c b/arch/mips/kernel/signal.c index c2acf31874a4..6254041b942f 100644 --- a/arch/mips/kernel/signal.c +++ b/arch/mips/kernel/signal.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include diff --git a/arch/parisc/kernel/signal.c b/arch/parisc/kernel/signal.c index 5ca1c02b805a..8eb3c63c407a 100644 --- a/arch/parisc/kernel/signal.c +++ b/arch/parisc/kernel/signal.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From bc581770cfdd8c17ea17d324dc05e2f9c599e7ca Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 15 Sep 2009 17:30:37 +0100 Subject: ARM: 5580/2: ARM TCM (Tightly-Coupled Memory) support v3 This adds the TCM interface to Linux, when active, it will detect and report TCM memories and sizes early in boot if present, introduce generic TCM memory handling, provide a generic TCM memory pool and select TCM memory for the U300 platform. See the Documentation/arm/tcm.txt for documentation. Signed-off-by: Linus Walleij Signed-off-by: Russell King --- Documentation/arm/tcm.txt | 145 ++++++++++++++++++ arch/arm/Kconfig | 5 + arch/arm/include/asm/cputype.h | 5 + arch/arm/include/asm/tcm.h | 31 ++++ arch/arm/kernel/Makefile | 1 + arch/arm/kernel/setup.c | 2 + arch/arm/kernel/tcm.c | 246 +++++++++++++++++++++++++++++++ arch/arm/kernel/tcm.h | 17 +++ arch/arm/kernel/vmlinux.lds.S | 57 +++++++ arch/arm/mach-u300/include/mach/memory.h | 8 + arch/arm/mm/init.c | 8 + 11 files changed, 525 insertions(+) create mode 100644 Documentation/arm/tcm.txt create mode 100644 arch/arm/include/asm/tcm.h create mode 100644 arch/arm/kernel/tcm.c create mode 100644 arch/arm/kernel/tcm.h (limited to 'arch/arm/kernel') diff --git a/Documentation/arm/tcm.txt b/Documentation/arm/tcm.txt new file mode 100644 index 000000000000..074f4be6667f --- /dev/null +++ b/Documentation/arm/tcm.txt @@ -0,0 +1,145 @@ +ARM TCM (Tightly-Coupled Memory) handling in Linux +---- +Written by Linus Walleij + +Some ARM SoC:s have a so-called TCM (Tightly-Coupled Memory). +This is usually just a few (4-64) KiB of RAM inside the ARM +processor. + +Due to being embedded inside the CPU The TCM has a +Harvard-architecture, so there is an ITCM (instruction TCM) +and a DTCM (data TCM). The DTCM can not contain any +instructions, but the ITCM can actually contain data. +The size of DTCM or ITCM is minimum 4KiB so the typical +minimum configuration is 4KiB ITCM and 4KiB DTCM. + +ARM CPU:s have special registers to read out status, physical +location and size of TCM memories. arch/arm/include/asm/cputype.h +defines a CPUID_TCM register that you can read out from the +system control coprocessor. Documentation from ARM can be found +at http://infocenter.arm.com, search for "TCM Status Register" +to see documents for all CPUs. Reading this register you can +determine if ITCM (bit 0) and/or DTCM (bit 16) is present in the +machine. + +There is further a TCM region register (search for "TCM Region +Registers" at the ARM site) that can report and modify the location +size of TCM memories at runtime. This is used to read out and modify +TCM location and size. Notice that this is not a MMU table: you +actually move the physical location of the TCM around. At the +place you put it, it will mask any underlying RAM from the +CPU so it is usually wise not to overlap any physical RAM with +the TCM. The TCM memory exists totally outside the MMU and will +override any MMU mappings. + +Code executing inside the ITCM does not "see" any MMU mappings +and e.g. register accesses must be made to physical addresses. + +TCM is used for a few things: + +- FIQ and other interrupt handlers that need deterministic + timing and cannot wait for cache misses. + +- Idle loops where all external RAM is set to self-refresh + retention mode, so only on-chip RAM is accessible by + the CPU and then we hang inside ITCM waiting for an + interrupt. + +- Other operations which implies shutting off or reconfiguring + the external RAM controller. + +There is an interface for using TCM on the ARM architecture +in . Using this interface it is possible to: + +- Define the physical address and size of ITCM and DTCM. + +- Tag functions to be compiled into ITCM. + +- Tag data and constants to be allocated to DTCM and ITCM. + +- Have the remaining TCM RAM added to a special + allocation pool with gen_pool_create() and gen_pool_add() + and provice tcm_alloc() and tcm_free() for this + memory. Such a heap is great for things like saving + device state when shutting off device power domains. + +A machine that has TCM memory shall select HAVE_TCM in +arch/arm/Kconfig for itself, and then the +rest of the functionality will depend on the physical +location and size of ITCM and DTCM to be defined in +mach/memory.h for the machine. Code that needs to use +TCM shall #include If the TCM is not located +at the place given in memory.h it will be moved using +the TCM Region registers. + +Functions to go into itcm can be tagged like this: +int __tcmfunc foo(int bar); + +Variables to go into dtcm can be tagged like this: +int __tcmdata foo; + +Constants can be tagged like this: +int __tcmconst foo; + +To put assembler into TCM just use +.section ".tcm.text" or .section ".tcm.data" +respectively. + +Example code: + +#include + +/* Uninitialized data */ +static u32 __tcmdata tcmvar; +/* Initialized data */ +static u32 __tcmdata tcmassigned = 0x2BADBABEU; +/* Constant */ +static const u32 __tcmconst tcmconst = 0xCAFEBABEU; + +static void __tcmlocalfunc tcm_to_tcm(void) +{ + int i; + for (i = 0; i < 100; i++) + tcmvar ++; +} + +static void __tcmfunc hello_tcm(void) +{ + /* Some abstract code that runs in ITCM */ + int i; + for (i = 0; i < 100; i++) { + tcmvar ++; + } + tcm_to_tcm(); +} + +static void __init test_tcm(void) +{ + u32 *tcmem; + int i; + + hello_tcm(); + printk("Hello TCM executed from ITCM RAM\n"); + + printk("TCM variable from testrun: %u @ %p\n", tcmvar, &tcmvar); + tcmvar = 0xDEADBEEFU; + printk("TCM variable: 0x%x @ %p\n", tcmvar, &tcmvar); + + printk("TCM assigned variable: 0x%x @ %p\n", tcmassigned, &tcmassigned); + + printk("TCM constant: 0x%x @ %p\n", tcmconst, &tcmconst); + + /* Allocate some TCM memory from the pool */ + tcmem = tcm_alloc(20); + if (tcmem) { + printk("TCM Allocated 20 bytes of TCM @ %p\n", tcmem); + tcmem[0] = 0xDEADBEEFU; + tcmem[1] = 0x2BADBABEU; + tcmem[2] = 0xCAFEBABEU; + tcmem[3] = 0xDEADBEEFU; + tcmem[4] = 0x2BADBABEU; + for (i = 0; i < 5; i++) + printk("TCM tcmem[%d] = %08x\n", i, tcmem[i]); + tcm_free(tcmem, 20); + } +} diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index d778a699f577..1c4119c60040 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -46,6 +46,10 @@ config GENERIC_CLOCKEVENTS_BROADCAST depends on GENERIC_CLOCKEVENTS default y if SMP && !LOCAL_TIMERS +config HAVE_TCM + bool + select GENERIC_ALLOCATOR + config NO_IOPORT bool @@ -649,6 +653,7 @@ config ARCH_U300 bool "ST-Ericsson U300 Series" depends on MMU select CPU_ARM926T + select HAVE_TCM select ARM_AMBA select ARM_VIC select GENERIC_TIME diff --git a/arch/arm/include/asm/cputype.h b/arch/arm/include/asm/cputype.h index b3e656c6fb78..54a1f1d76b14 100644 --- a/arch/arm/include/asm/cputype.h +++ b/arch/arm/include/asm/cputype.h @@ -63,6 +63,11 @@ static inline unsigned int __attribute_const__ read_cpuid_cachetype(void) return read_cpuid(CPUID_CACHETYPE); } +static inline unsigned int __attribute_const__ read_cpuid_tcmstatus(void) +{ + return read_cpuid(CPUID_TCM); +} + /* * Intel's XScale3 core supports some v6 features (supersections, L2) * but advertises itself as v5 as it does not support the v6 ISA. For diff --git a/arch/arm/include/asm/tcm.h b/arch/arm/include/asm/tcm.h new file mode 100644 index 000000000000..5929ef5d927a --- /dev/null +++ b/arch/arm/include/asm/tcm.h @@ -0,0 +1,31 @@ +/* + * + * Copyright (C) 2008-2009 ST-Ericsson AB + * License terms: GNU General Public License (GPL) version 2 + * + * Author: Rickard Andersson + * Author: Linus Walleij + * + */ +#ifndef __ASMARM_TCM_H +#define __ASMARM_TCM_H + +#ifndef CONFIG_HAVE_TCM +#error "You should not be including tcm.h unless you have a TCM!" +#endif + +#include + +/* Tag variables with this */ +#define __tcmdata __section(.tcm.data) +/* Tag constants with this */ +#define __tcmconst __section(.tcm.rodata) +/* Tag functions inside TCM called from outside TCM with this */ +#define __tcmfunc __attribute__((long_call)) __section(.tcm.text) noinline +/* Tag function inside TCM called from inside TCM with this */ +#define __tcmlocalfunc __section(.tcm.text) + +void *tcm_alloc(size_t len); +void tcm_free(void *addr, size_t len); + +#endif diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 3213c9382b17..6808e01aedcd 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o obj-$(CONFIG_ARM_THUMBEE) += thumbee.o obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_ARM_UNWIND) += unwind.o +obj-$(CONFIG_HAVE_TCM) += tcm.o obj-$(CONFIG_CRUNCH) += crunch.o crunch-bits.o AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312 diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index d4d4f77c91b2..c6c57b640b6b 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -45,6 +45,7 @@ #include "compat.h" #include "atags.h" +#include "tcm.h" #ifndef MEM_SIZE #define MEM_SIZE (16*1024*1024) @@ -749,6 +750,7 @@ void __init setup_arch(char **cmdline_p) #endif cpu_init(); + tcm_init(); /* * Set up various architecture-specific pointers diff --git a/arch/arm/kernel/tcm.c b/arch/arm/kernel/tcm.c new file mode 100644 index 000000000000..e50303868f1b --- /dev/null +++ b/arch/arm/kernel/tcm.c @@ -0,0 +1,246 @@ +/* + * Copyright (C) 2008-2009 ST-Ericsson AB + * License terms: GNU General Public License (GPL) version 2 + * TCM memory handling for ARM systems + * + * Author: Linus Walleij + * Author: Rickard Andersson + */ +#include +#include +#include +#include +#include +#include +#include /* memcpy */ +#include /* PAGE_SHIFT */ +#include +#include +#include +#include "tcm.h" + +/* Scream and warn about misuse */ +#if !defined(ITCM_OFFSET) || !defined(ITCM_END) || \ + !defined(DTCM_OFFSET) || !defined(DTCM_END) +#error "TCM support selected but offsets not defined!" +#endif + +static struct gen_pool *tcm_pool; + +/* TCM section definitions from the linker */ +extern char __itcm_start, __sitcm_text, __eitcm_text; +extern char __dtcm_start, __sdtcm_data, __edtcm_data; + +/* + * TCM memory resources + */ +static struct resource dtcm_res = { + .name = "DTCM RAM", + .start = DTCM_OFFSET, + .end = DTCM_END, + .flags = IORESOURCE_MEM +}; + +static struct resource itcm_res = { + .name = "ITCM RAM", + .start = ITCM_OFFSET, + .end = ITCM_END, + .flags = IORESOURCE_MEM +}; + +static struct map_desc dtcm_iomap[] __initdata = { + { + .virtual = DTCM_OFFSET, + .pfn = __phys_to_pfn(DTCM_OFFSET), + .length = (DTCM_END - DTCM_OFFSET + 1), + .type = MT_UNCACHED + } +}; + +static struct map_desc itcm_iomap[] __initdata = { + { + .virtual = ITCM_OFFSET, + .pfn = __phys_to_pfn(ITCM_OFFSET), + .length = (ITCM_END - ITCM_OFFSET + 1), + .type = MT_UNCACHED + } +}; + +/* + * Allocate a chunk of TCM memory + */ +void *tcm_alloc(size_t len) +{ + unsigned long vaddr; + + if (!tcm_pool) + return NULL; + + vaddr = gen_pool_alloc(tcm_pool, len); + if (!vaddr) + return NULL; + + return (void *) vaddr; +} +EXPORT_SYMBOL(tcm_alloc); + +/* + * Free a chunk of TCM memory + */ +void tcm_free(void *addr, size_t len) +{ + gen_pool_free(tcm_pool, (unsigned long) addr, len); +} +EXPORT_SYMBOL(tcm_free); + + +static void __init setup_tcm_bank(u8 type, u32 offset, u32 expected_size) +{ + const int tcm_sizes[16] = { 0, -1, -1, 4, 8, 16, 32, 64, 128, + 256, 512, 1024, -1, -1, -1, -1 }; + u32 tcm_region; + int tcm_size; + + /* Read the special TCM region register c9, 0 */ + if (!type) + asm("mrc p15, 0, %0, c9, c1, 0" + : "=r" (tcm_region)); + else + asm("mrc p15, 0, %0, c9, c1, 1" + : "=r" (tcm_region)); + + tcm_size = tcm_sizes[(tcm_region >> 2) & 0x0f]; + if (tcm_size < 0) { + pr_err("CPU: %sTCM of unknown size!\n", + type ? "I" : "D"); + } else { + pr_info("CPU: found %sTCM %dk @ %08x, %senabled\n", + type ? "I" : "D", + tcm_size, + (tcm_region & 0xfffff000U), + (tcm_region & 1) ? "" : "not "); + } + + if (tcm_size != expected_size) { + pr_crit("CPU: %sTCM was detected %dk but expected %dk!\n", + type ? "I" : "D", + tcm_size, + expected_size); + /* Adjust to the expected size? what can we do... */ + } + + /* Force move the TCM bank to where we want it, enable */ + tcm_region = offset | (tcm_region & 0x00000ffeU) | 1; + + if (!type) + asm("mcr p15, 0, %0, c9, c1, 0" + : /* No output operands */ + : "r" (tcm_region)); + else + asm("mcr p15, 0, %0, c9, c1, 1" + : /* No output operands */ + : "r" (tcm_region)); + + pr_debug("CPU: moved %sTCM %dk to %08x, enabled\n", + type ? "I" : "D", + tcm_size, + (tcm_region & 0xfffff000U)); +} + +/* + * This initializes the TCM memory + */ +void __init tcm_init(void) +{ + u32 tcm_status = read_cpuid_tcmstatus(); + char *start; + char *end; + char *ram; + + /* Setup DTCM if present */ + if (tcm_status & (1 << 16)) { + setup_tcm_bank(0, DTCM_OFFSET, + (DTCM_END - DTCM_OFFSET + 1) >> 10); + request_resource(&iomem_resource, &dtcm_res); + iotable_init(dtcm_iomap, 1); + /* Copy data from RAM to DTCM */ + start = &__sdtcm_data; + end = &__edtcm_data; + ram = &__dtcm_start; + memcpy(start, ram, (end-start)); + pr_debug("CPU DTCM: copied data from %p - %p\n", start, end); + } + + /* Setup ITCM if present */ + if (tcm_status & 1) { + setup_tcm_bank(1, ITCM_OFFSET, + (ITCM_END - ITCM_OFFSET + 1) >> 10); + request_resource(&iomem_resource, &itcm_res); + iotable_init(itcm_iomap, 1); + /* Copy code from RAM to ITCM */ + start = &__sitcm_text; + end = &__eitcm_text; + ram = &__itcm_start; + memcpy(start, ram, (end-start)); + pr_debug("CPU ITCM: copied code from %p - %p\n", start, end); + } +} + +/* + * This creates the TCM memory pool and has to be done later, + * during the core_initicalls, since the allocator is not yet + * up and running when the first initialization runs. + */ +static int __init setup_tcm_pool(void) +{ + u32 tcm_status = read_cpuid_tcmstatus(); + u32 dtcm_pool_start = (u32) &__edtcm_data; + u32 itcm_pool_start = (u32) &__eitcm_text; + int ret; + + /* + * Set up malloc pool, 2^2 = 4 bytes granularity since + * the TCM is sometimes just 4 KiB. NB: pages and cache + * line alignments does not matter in TCM! + */ + tcm_pool = gen_pool_create(2, -1); + + pr_debug("Setting up TCM memory pool\n"); + + /* Add the rest of DTCM to the TCM pool */ + if (tcm_status & (1 << 16)) { + if (dtcm_pool_start < DTCM_END) { + ret = gen_pool_add(tcm_pool, dtcm_pool_start, + DTCM_END - dtcm_pool_start + 1, -1); + if (ret) { + pr_err("CPU DTCM: could not add DTCM " \ + "remainder to pool!\n"); + return ret; + } + pr_debug("CPU DTCM: Added %08x bytes @ %08x to " \ + "the TCM memory pool\n", + DTCM_END - dtcm_pool_start + 1, + dtcm_pool_start); + } + } + + /* Add the rest of ITCM to the TCM pool */ + if (tcm_status & 1) { + if (itcm_pool_start < ITCM_END) { + ret = gen_pool_add(tcm_pool, itcm_pool_start, + ITCM_END - itcm_pool_start + 1, -1); + if (ret) { + pr_err("CPU ITCM: could not add ITCM " \ + "remainder to pool!\n"); + return ret; + } + pr_debug("CPU ITCM: Added %08x bytes @ %08x to " \ + "the TCM memory pool\n", + ITCM_END - itcm_pool_start + 1, + itcm_pool_start); + } + } + return 0; +} + +core_initcall(setup_tcm_pool); diff --git a/arch/arm/kernel/tcm.h b/arch/arm/kernel/tcm.h new file mode 100644 index 000000000000..8015ad434a40 --- /dev/null +++ b/arch/arm/kernel/tcm.h @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2008-2009 ST-Ericsson AB + * License terms: GNU General Public License (GPL) version 2 + * TCM memory handling for ARM systems + * + * Author: Linus Walleij + * Author: Rickard Andersson + */ + +#ifdef CONFIG_HAVE_TCM +void __init tcm_init(void); +#else +/* No TCM support, just blank inlines to be optimized out */ +inline void tcm_init(void) +{ +} +#endif diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 69371028a202..39d3ffb9ff2b 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -198,6 +198,63 @@ SECTIONS } _edata_loc = __data_loc + SIZEOF(.data); +#ifdef CONFIG_HAVE_TCM + /* + * We align everything to a page boundary so we can + * free it after init has commenced and TCM contents have + * been copied to its destination. + */ + .tcm_start : { + . = ALIGN(PAGE_SIZE); + __tcm_start = .; + __itcm_start = .; + } + + /* + * Link these to the ITCM RAM + * Put VMA to the TCM address and LMA to the common RAM + * and we'll upload the contents from RAM to TCM and free + * the used RAM after that. + */ + .text_itcm ITCM_OFFSET : AT(__itcm_start) + { + __sitcm_text = .; + *(.tcm.text) + *(.tcm.rodata) + . = ALIGN(4); + __eitcm_text = .; + } + + /* + * Reset the dot pointer, this is needed to create the + * relative __dtcm_start below (to be used as extern in code). + */ + . = ADDR(.tcm_start) + SIZEOF(.tcm_start) + SIZEOF(.text_itcm); + + .dtcm_start : { + __dtcm_start = .; + } + + /* TODO: add remainder of ITCM as well, that can be used for data! */ + .data_dtcm DTCM_OFFSET : AT(__dtcm_start) + { + . = ALIGN(4); + __sdtcm_data = .; + *(.tcm.data) + . = ALIGN(4); + __edtcm_data = .; + } + + /* Reset the dot pointer or the linker gets confused */ + . = ADDR(.dtcm_start) + SIZEOF(.data_dtcm); + + /* End marker for freeing TCM copy in linked object */ + .tcm_end : AT(ADDR(.dtcm_start) + SIZEOF(.data_dtcm)){ + . = ALIGN(PAGE_SIZE); + __tcm_end = .; + } +#endif + .bss : { __bss_start = .; /* BSS */ *(.bss) diff --git a/arch/arm/mach-u300/include/mach/memory.h b/arch/arm/mach-u300/include/mach/memory.h index bf134bcc129d..ab000df7fc03 100644 --- a/arch/arm/mach-u300/include/mach/memory.h +++ b/arch/arm/mach-u300/include/mach/memory.h @@ -34,6 +34,14 @@ (CONFIG_MACH_U300_ACCESS_MEM_SIZE & 1))*1024*1024 + 0x100) #endif +/* + * TCM memory whereabouts + */ +#define ITCM_OFFSET 0xffff2000 +#define ITCM_END 0xffff3fff +#define DTCM_OFFSET 0xffff4000 +#define DTCM_END 0xffff5fff + /* * We enable a real big DMA buffer if need be. */ diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index ea36186f32c3..764d5dc9af76 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -613,6 +613,14 @@ void __init mem_init(void) void free_initmem(void) { +#ifdef CONFIG_HAVE_TCM + extern char *__tcm_start, *__tcm_end; + + totalram_pages += free_area(__phys_to_pfn(__pa(__tcm_start)), + __phys_to_pfn(__pa(__tcm_end)), + "TCM link"); +#endif + if (!machine_is_integrator() && !machine_is_cintegrator()) totalram_pages += free_area(__phys_to_pfn(__pa(__init_begin)), __phys_to_pfn(__pa(__init_end)), -- cgit v1.2.3 From 200b812d0084f800bc52465e273b118ff5f8141f Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 18 Sep 2009 23:27:05 +0100 Subject: Clear the exclusive monitor when returning from an exception The patch adds a CLREX or dummy STREX to the exception return path. This is needed because several atomic/locking operations use a pair of LDREX/STREXEQ and the EQ condition may not always be satisfied. This would leave the exclusive monitor status set and may cause problems with atomic/locking operations in the interrupted code. With this patch, the atomic_set() operation can be a simple STR instruction (on SMP systems, the global exclusive monitor is cleared by STR anyway). Clearing the exclusive monitor during context switch is no longer needed as this is handled by the exception return path anyway. Signed-off-by: Catalin Marinas Reported-by: Jamie Lokier --- arch/arm/include/asm/atomic.h | 26 +++++++------------------- arch/arm/kernel/entry-armv.S | 7 ------- arch/arm/kernel/entry-header.S | 14 ++++++++++++++ 3 files changed, 21 insertions(+), 26 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h index 9ed2377fe8e5..d0daeab2234e 100644 --- a/arch/arm/include/asm/atomic.h +++ b/arch/arm/include/asm/atomic.h @@ -19,31 +19,21 @@ #ifdef __KERNEL__ +/* + * On ARM, ordinary assignment (str instruction) doesn't clear the local + * strex/ldrex monitor on some implementations. The reason we can use it for + * atomic_set() is the clrex or dummy strex done on every exception return. + */ #define atomic_read(v) ((v)->counter) +#define atomic_set(v,i) (((v)->counter) = (i)) #if __LINUX_ARM_ARCH__ >= 6 /* * ARMv6 UP and SMP safe atomic ops. We use load exclusive and * store exclusive to ensure that these are atomic. We may loop - * to ensure that the update happens. Writing to 'v->counter' - * without using the following operations WILL break the atomic - * nature of these ops. + * to ensure that the update happens. */ -static inline void atomic_set(atomic_t *v, int i) -{ - unsigned long tmp; - - __asm__ __volatile__("@ atomic_set\n" -"1: ldrex %0, [%1]\n" -" strex %0, %2, [%1]\n" -" teq %0, #0\n" -" bne 1b" - : "=&r" (tmp) - : "r" (&v->counter), "r" (i) - : "cc"); -} - static inline void atomic_add(int i, atomic_t *v) { unsigned long tmp; @@ -163,8 +153,6 @@ static inline void atomic_clear_mask(unsigned long mask, unsigned long *addr) #error SMP not supported on pre-ARMv6 CPUs #endif -#define atomic_set(v,i) (((v)->counter) = (i)) - static inline int atomic_add_return(int i, atomic_t *v) { unsigned long flags; diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 3d727a8a23bc..a332bc7225bf 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -734,13 +734,6 @@ ENTRY(__switch_to) #ifdef CONFIG_MMU ldr r6, [r2, #TI_CPU_DOMAIN] #endif -#if __LINUX_ARM_ARCH__ >= 6 -#ifdef CONFIG_CPU_32v6K - clrex -#else - strex r5, r4, [ip] @ Clear exclusive monitor -#endif -#endif #if defined(CONFIG_HAS_TLS_REG) mcr p15, 0, r3, c13, c0, 3 @ set TLS register #elif !defined(CONFIG_TLS_REG_EMUL) diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index a4eaf4f920c5..e17e3c30d957 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -76,13 +76,25 @@ #ifndef CONFIG_THUMB2_KERNEL .macro svc_exit, rpsr msr spsr_cxsf, \rpsr +#if defined(CONFIG_CPU_32v6K) + clrex @ clear the exclusive monitor ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr +#elif defined (CONFIG_CPU_V6) + ldr r0, [sp] + strex r1, r2, [sp] @ clear the exclusive monitor + ldmib sp, {r1 - pc}^ @ load r1 - pc, cpsr +#endif .endm .macro restore_user_regs, fast = 0, offset = 0 ldr r1, [sp, #\offset + S_PSR] @ get calling cpsr ldr lr, [sp, #\offset + S_PC]! @ get pc msr spsr_cxsf, r1 @ save in spsr_svc +#if defined(CONFIG_CPU_32v6K) + clrex @ clear the exclusive monitor +#elif defined (CONFIG_CPU_V6) + strex r1, r2, [sp] @ clear the exclusive monitor +#endif .if \fast ldmdb sp, {r1 - lr}^ @ get calling r1 - lr .else @@ -98,6 +110,7 @@ .endm #else /* CONFIG_THUMB2_KERNEL */ .macro svc_exit, rpsr + clrex @ clear the exclusive monitor ldr r0, [sp, #S_SP] @ top of the stack ldr r1, [sp, #S_PC] @ return address tst r0, #4 @ orig stack 8-byte aligned? @@ -110,6 +123,7 @@ .endm .macro restore_user_regs, fast = 0, offset = 0 + clrex @ clear the exclusive monitor mov r2, sp load_user_sp_lr r2, r3, \offset + S_SP @ calling sp, lr ldr r1, [sp, #\offset + S_PSR] @ get calling cpsr -- cgit v1.2.3 From 83e686ea0291ee93b87dcdc00b96443b80de56c9 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Fri, 18 Sep 2009 23:27:07 +0100 Subject: Thumb-2: Correctly handle undefined instructions in the kernel VFP instructions in the kernel may trigger undefined exceptions if VFP hardware is not present. This patch corrects the loading of such Thumb-2 instructions. It also marks the "no_fp" label as a function so that the linker generate a Thumb address. Signed-off-by: Catalin Marinas --- arch/arm/kernel/entry-armv.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index a332bc7225bf..0a2ba51cf35d 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -272,7 +272,15 @@ __und_svc: @ @ r0 - instruction @ +#ifndef CONFIG_THUMB2_KERNEL ldr r0, [r2, #-4] +#else + ldrh r0, [r2, #-2] @ Thumb instruction at LR - 2 + and r9, r0, #0xf800 + cmp r9, #0xe800 @ 32-bit instruction if xx >= 0 + ldrhhs r9, [r2] @ bottom 16 bits + orrhs r0, r9, r0, lsl #16 +#endif adr r9, BSYM(1f) bl call_fpe @@ -678,7 +686,9 @@ ENTRY(fp_enter) .word no_fp .previous -no_fp: mov pc, lr +ENTRY(no_fp) + mov pc, lr +ENDPROC(no_fp) __und_usr_unknown: enable_irq -- cgit v1.2.3 From 51b563fc93c8cb5bff1d67a0a71c374e4a4ea049 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 20 Sep 2009 12:28:22 +0200 Subject: arm, cris, mips, sparc, powerpc, um, xtensa: fix build with bash 4.0 Albin Tonnerre reported: Bash 4 filters out variables which contain a dot in them. This happends to be the case of CPPFLAGS_vmlinux.lds. This is rather unfortunate, as it now causes build failures when using SHELL=/bin/bash to compile, or when bash happens to be used by make (eg when it's /bin/sh) Remove the common definition of CPPFLAGS_vmlinux.lds by pushing relevant stuff to either Makefile.build or the arch specific kernel/Makefile where we build the linker script. This is also nice cleanup as we move the information out where it is used. Notes for the different architectures touched: arm - we use an already exported symbol cris - we use a config symbol aleady available [Not build tested] mips - the jiffies complexity has moved to vmlinux.lds.S where we need it. Added a few variables to CPPFLAGS - they are only used by the linker script. [Not build tested] powerpc - removed assignment that is not needed [not build tested] sparc - simplified it using $(BITS) um - introduced a few new exported variables to deal with this xtensa - added options to CPP invocation [not build tested] Cc: Albin Tonnerre Cc: Russell King Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Jeff Dike Cc: Chris Zankel Signed-off-by: Sam Ravnborg --- Makefile | 5 ----- arch/arm/Makefile | 2 +- arch/arm/kernel/Makefile | 3 ++- arch/cris/Makefile | 2 -- arch/cris/kernel/Makefile | 1 + arch/mips/Makefile | 27 +++------------------------ arch/mips/kernel/vmlinux.lds.S | 13 +++++++++++-- arch/powerpc/Makefile | 2 -- arch/sparc/Makefile | 4 ---- arch/sparc/kernel/Makefile | 6 +++++- arch/um/Makefile | 9 ++++----- arch/um/kernel/Makefile | 3 +++ arch/um/kernel/vmlinux.lds.S | 3 +++ arch/xtensa/kernel/Makefile | 3 ++- scripts/Makefile.build | 3 ++- 15 files changed, 37 insertions(+), 49 deletions(-) (limited to 'arch/arm/kernel') diff --git a/Makefile b/Makefile index 5e1e74c9976c..f908accd332b 100644 --- a/Makefile +++ b/Makefile @@ -1027,11 +1027,6 @@ prepare0: archprepare FORCE # All the preparing.. prepare: prepare0 -# Leave this as default for preprocessing vmlinux.lds.S, which is now -# done in arch/$(ARCH)/kernel/Makefile - -export CPPFLAGS_vmlinux.lds += -P -C -U$(ARCH) - # The asm symlink changes when $(ARCH) changes. # Detect this and ask user to run make mrproper # If asm is a stale symlink (point to dir that does not exist) remove it diff --git a/arch/arm/Makefile b/arch/arm/Makefile index 68c6ab0749da..c695fdac5b30 100644 --- a/arch/arm/Makefile +++ b/arch/arm/Makefile @@ -14,7 +14,7 @@ LDFLAGS_vmlinux :=-p --no-undefined -X ifeq ($(CONFIG_CPU_ENDIAN_BE8),y) LDFLAGS_vmlinux += --be8 endif -CPPFLAGS_vmlinux.lds = -DTEXT_OFFSET=$(TEXT_OFFSET) + OBJCOPYFLAGS :=-O binary -R .note -R .note.gnu.build-id -R .comment -S GZFLAGS :=-9 #KBUILD_CFLAGS +=-pipe diff --git a/arch/arm/kernel/Makefile b/arch/arm/kernel/Makefile index 3213c9382b17..c446aeff7b89 100644 --- a/arch/arm/kernel/Makefile +++ b/arch/arm/kernel/Makefile @@ -2,7 +2,8 @@ # Makefile for the linux kernel. # -AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) +CPPFLAGS_vmlinux.lds := -DTEXT_OFFSET=$(TEXT_OFFSET) +AFLAGS_head.o := -DTEXT_OFFSET=$(TEXT_OFFSET) ifdef CONFIG_DYNAMIC_FTRACE CFLAGS_REMOVE_ftrace.o = -pg diff --git a/arch/cris/Makefile b/arch/cris/Makefile index 71e17d3eeddb..29c2ceb38a76 100644 --- a/arch/cris/Makefile +++ b/arch/cris/Makefile @@ -42,8 +42,6 @@ LD = $(CROSS_COMPILE)ld -mcrislinux OBJCOPYFLAGS := -O binary -R .note -R .comment -S -CPPFLAGS_vmlinux.lds = -DDRAM_VIRTUAL_BASE=0x$(CONFIG_ETRAX_DRAM_VIRTUAL_BASE) - KBUILD_AFLAGS += -mlinux -march=$(arch-y) $(inc) KBUILD_CFLAGS += -mlinux -march=$(arch-y) -pipe $(inc) KBUILD_CPPFLAGS += $(inc) diff --git a/arch/cris/kernel/Makefile b/arch/cris/kernel/Makefile index ee7bcd4d20b2..b45640b3e600 100644 --- a/arch/cris/kernel/Makefile +++ b/arch/cris/kernel/Makefile @@ -3,6 +3,7 @@ # Makefile for the linux kernel. # +CPPFLAGS_vmlinux.lds := -DDRAM_VIRTUAL_BASE=0x$(CONFIG_ETRAX_DRAM_VIRTUAL_BASE) extra-y := vmlinux.lds obj-y := process.o traps.o irq.o ptrace.o setup.o time.o sys_cris.o diff --git a/arch/mips/Makefile b/arch/mips/Makefile index c825b14b4ed0..77f5021218d3 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -627,16 +627,6 @@ endif cflags-y += -I$(srctree)/arch/mips/include/asm/mach-generic drivers-$(CONFIG_PCI) += arch/mips/pci/ -ifdef CONFIG_32BIT -ifdef CONFIG_CPU_LITTLE_ENDIAN -JIFFIES = jiffies_64 -else -JIFFIES = jiffies_64 + 4 -endif -else -JIFFIES = jiffies_64 -endif - # # Automatically detect the build format. By default we choose # the elf format according to the load address. @@ -660,8 +650,9 @@ ifdef CONFIG_64BIT endif KBUILD_AFLAGS += $(cflags-y) -KBUILD_CFLAGS += $(cflags-y) \ - -D"VMLINUX_LOAD_ADDRESS=$(load-y)" +KBUILD_CFLAGS += $(cflags-y) +KBUILD_CPPFLAGS += -D"VMLINUX_LOAD_ADDRESS=$(load-y)" +KBUILD_CPPFLAGS += -D"DATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)" LDFLAGS += -m $(ld-emul) @@ -676,18 +667,6 @@ endif OBJCOPYFLAGS += --remove-section=.reginfo -# -# Choosing incompatible machines durings configuration will result in -# error messages during linking. Select a default linkscript if -# none has been choosen above. -# - -CPPFLAGS_vmlinux.lds := \ - $(KBUILD_CFLAGS) \ - -D"LOADADDR=$(load-y)" \ - -D"JIFFIES=$(JIFFIES)" \ - -D"DATAOFFSET=$(if $(dataoffset-y),$(dataoffset-y),0)" - head-y := arch/mips/kernel/head.o arch/mips/kernel/init_task.o libs-y += arch/mips/lib/ diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 2769bed3d2af..9bf0e3df7c5a 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -10,7 +10,16 @@ PHDRS { text PT_LOAD FLAGS(7); /* RWX */ note PT_NOTE FLAGS(4); /* R__ */ } -jiffies = JIFFIES; + +ifdef CONFIG_32BIT + ifdef CONFIG_CPU_LITTLE_ENDIAN + jiffies = jiffies_64; + else + jiffies = jiffies_64 + 4; + endif +else + jiffies = jiffies_64; +endif SECTIONS { @@ -29,7 +38,7 @@ SECTIONS /* . = 0xa800000000300000; */ . = 0xffffffff80300000; #endif - . = LOADADDR; + . = VMLINUX_LOAD_ADDRESS; /* read-only */ _text = .; /* Text and read-only data */ .text : { diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index 13e3fd852d63..aacf629c1a9f 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -158,8 +158,6 @@ drivers-$(CONFIG_OPROFILE) += arch/powerpc/oprofile/ # Default to zImage, override when needed all: zImage -CPPFLAGS_vmlinux.lds := -Upowerpc - BOOT_TARGETS = zImage zImage.initrd uImage zImage% dtbImage% treeImage.% cuImage.% simpleImage.% PHONY += $(BOOT_TARGETS) diff --git a/arch/sparc/Makefile b/arch/sparc/Makefile index 467221dd5702..dfe272d14465 100644 --- a/arch/sparc/Makefile +++ b/arch/sparc/Makefile @@ -31,7 +31,6 @@ export BITS := 32 #KBUILD_CFLAGS += -g -pipe -fcall-used-g5 -fcall-used-g7 KBUILD_CFLAGS += -m32 -pipe -mno-fpu -fcall-used-g5 -fcall-used-g7 KBUILD_AFLAGS += -m32 -CPPFLAGS_vmlinux.lds += -m32 #LDFLAGS_vmlinux = -N -Ttext 0xf0004000 # Since 2.5.40, the first stage is left not btfix-ed. @@ -45,9 +44,6 @@ else CHECKFLAGS += -D__sparc__ -D__sparc_v9__ -D__arch64__ -m64 -# Undefine sparc when processing vmlinux.lds - it is used -# And teach CPP we are doing 64 bit builds (for this case) -CPPFLAGS_vmlinux.lds += -m64 -Usparc LDFLAGS := -m elf64_sparc export BITS := 64 diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 247cc620cee5..96aad394cd30 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -7,7 +7,11 @@ ccflags-y := -Werror extra-y := head_$(BITS).o extra-y += init_task.o -extra-y += vmlinux.lds + +# Undefine sparc when processing vmlinux.lds - it is used +# And teach CPP we are doing $(BITS) builds (for this case) +CPPFLAGS_vmlinux.lds := -Usparc -m$(BITS) +extra-y += vmlinux.lds obj-$(CONFIG_SPARC32) += entry.o wof.o wuf.o obj-$(CONFIG_SPARC32) += etrap_32.o diff --git a/arch/um/Makefile b/arch/um/Makefile index 0728def32234..fc633dbacf84 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -96,11 +96,10 @@ CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) $(call cc-option, -fno-stack-protector,) \ $(call cc-option, -fno-stack-protector-all,) -CONFIG_KERNEL_STACK_ORDER ?= 2 -STACK_SIZE := $(shell echo $$[ 4096 * (1 << $(CONFIG_KERNEL_STACK_ORDER)) ] ) - -CPPFLAGS_vmlinux.lds = -U$(SUBARCH) -DSTART=$(START) -DELF_ARCH=$(ELF_ARCH) \ - -DELF_FORMAT="$(ELF_FORMAT)" -DKERNEL_STACK_SIZE=$(STACK_SIZE) +# Options used by linker script +export LDS_START := $(START) +export LDS_ELF_ARCH := $(ELF_ARCH) +export LDS_ELF_FORMAT := $(ELF_FORMAT) # The wrappers will select whether using "malloc" or the kernel allocator. LINK_WRAPS = -Wl,--wrap,malloc -Wl,--wrap,free -Wl,--wrap,calloc diff --git a/arch/um/kernel/Makefile b/arch/um/kernel/Makefile index 388ec0a3ea9b..1119233597a1 100644 --- a/arch/um/kernel/Makefile +++ b/arch/um/kernel/Makefile @@ -3,6 +3,9 @@ # Licensed under the GPL # +CPPFLAGS_vmlinux.lds := -U$(SUBARCH) -DSTART=$(LDS_START) \ + -DELF_ARCH=$(LDS_ELF_ARCH) \ + -DELF_FORMAT=$(LDS_ELF_FORMAT) extra-y := vmlinux.lds clean-files := diff --git a/arch/um/kernel/vmlinux.lds.S b/arch/um/kernel/vmlinux.lds.S index f8aeb448aab6..16e49bfa2b42 100644 --- a/arch/um/kernel/vmlinux.lds.S +++ b/arch/um/kernel/vmlinux.lds.S @@ -1,3 +1,6 @@ + +KERNEL_STACK_SIZE = 4096 * (1 << CONFIG_KERNEL_STACK_ORDER); + #ifdef CONFIG_LD_SCRIPT_STATIC #include "uml.lds.S" #else diff --git a/arch/xtensa/kernel/Makefile b/arch/xtensa/kernel/Makefile index fe3186de6a33..6f56d95f2c1e 100644 --- a/arch/xtensa/kernel/Makefile +++ b/arch/xtensa/kernel/Makefile @@ -27,7 +27,8 @@ sed-y = -e 's/(\(\.[a-z]*it\|\.ref\|\)\.text)/(\1.literal \1.text)/g' \ -e 's/(\(\.text\.[a-z]*\))/(\1.literal \1)/g' quiet_cmd__cpp_lds_S = LDS $@ - cmd__cpp_lds_S = $(CPP) $(cpp_flags) -D__ASSEMBLY__ $< | sed $(sed-y) >$@ + cmd__cpp_lds_S = $(CPP) $(cpp_flags) -P -C -Uxtensa -D__ASSEMBLY__ $< \ + | sed $(sed-y) >$@ $(obj)/vmlinux.lds: $(src)/vmlinux.lds.S FORCE $(call if_changed_dep,_cpp_lds_S) diff --git a/scripts/Makefile.build b/scripts/Makefile.build index 6e41b3b6b154..d5425660a3df 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -270,7 +270,8 @@ targets += $(extra-y) $(MAKECMDGOALS) $(always) # Linker scripts preprocessor (.lds.S -> .lds) # --------------------------------------------------------------------------- quiet_cmd_cpp_lds_S = LDS $@ - cmd_cpp_lds_S = $(CPP) $(cpp_flags) -D__ASSEMBLY__ -o $@ $< + cmd_cpp_lds_S = $(CPP) $(cpp_flags) -P -C -U$(ARCH) \ + -D__ASSEMBLY__ -o $@ $< $(obj)/%.lds: $(src)/%.lds.S FORCE $(call if_changed_dep,cpp_lds_S) -- cgit v1.2.3 From d200c922bc2b1ac88b8d33b6cfff2ed837af186a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 20 Sep 2009 18:14:13 -0400 Subject: Use new __init_task_data macro in arch init_task.c files. Signed-off-by: Joe Perches Signed-off-by: Tim Abbott Acked-by: Paul Mundt Signed-off-by: Sam Ravnborg --- arch/arm/kernel/init_task.c | 5 ++--- arch/avr32/kernel/init_task.c | 5 ++--- arch/cris/kernel/process.c | 5 ++--- arch/frv/kernel/init_task.c | 5 ++--- arch/h8300/kernel/init_task.c | 5 ++--- arch/ia64/kernel/init_task.c | 3 ++- arch/m32r/kernel/init_task.c | 5 ++--- arch/m68k/kernel/process.c | 6 +++--- arch/m68knommu/kernel/init_task.c | 5 ++--- arch/microblaze/kernel/init_task.c | 5 ++--- arch/mips/kernel/init_task.c | 5 ++--- arch/mn10300/kernel/init_task.c | 5 ++--- arch/parisc/kernel/init_task.c | 4 ++-- arch/powerpc/kernel/init_task.c | 5 ++--- arch/powerpc/kernel/machine_kexec_64.c | 5 +++-- arch/s390/kernel/init_task.c | 5 ++--- arch/score/kernel/init_task.c | 5 ++--- arch/sh/kernel/init_task.c | 5 ++--- arch/sparc/kernel/init_task.c | 5 ++--- arch/um/kernel/init_task.c | 5 ++--- arch/x86/kernel/init_task.c | 5 ++--- arch/xtensa/kernel/init_task.c | 5 ++--- 22 files changed, 46 insertions(+), 62 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/init_task.c b/arch/arm/kernel/init_task.c index 3f470866bb89..e7cbb50dc356 100644 --- a/arch/arm/kernel/init_task.c +++ b/arch/arm/kernel/init_task.c @@ -24,9 +24,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * * The things we do for performance.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/avr32/kernel/init_task.c b/arch/avr32/kernel/init_task.c index 57ec9f2dcd95..6b2343e6fe33 100644 --- a/arch/avr32/kernel/init_task.c +++ b/arch/avr32/kernel/init_task.c @@ -18,9 +18,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); /* * Initial thread structure. Must be aligned on an 8192-byte boundary. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c index 51dcd04d2777..c99aeab7cef7 100644 --- a/arch/cris/kernel/process.c +++ b/arch/cris/kernel/process.c @@ -45,9 +45,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/frv/kernel/init_task.c b/arch/frv/kernel/init_task.c index 1d3df1d9495c..3c3e0b336a9d 100644 --- a/arch/frv/kernel/init_task.c +++ b/arch/frv/kernel/init_task.c @@ -19,9 +19,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/h8300/kernel/init_task.c b/arch/h8300/kernel/init_task.c index 089c65ed6eb3..54c1062ee80e 100644 --- a/arch/h8300/kernel/init_task.c +++ b/arch/h8300/kernel/init_task.c @@ -31,7 +31,6 @@ EXPORT_SYMBOL(init_task); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/arch/ia64/kernel/init_task.c b/arch/ia64/kernel/init_task.c index c475fc281be7..e253ab8fcbc8 100644 --- a/arch/ia64/kernel/init_task.c +++ b/arch/ia64/kernel/init_task.c @@ -33,7 +33,8 @@ union { struct thread_info thread_info; } s; unsigned long stack[KERNEL_STACK_SIZE/sizeof (unsigned long)]; -} init_task_mem asm ("init_task") __attribute__((section(".data.init_task"))) = {{ +} init_task_mem asm ("init_task") __init_task_data = + {{ .task = INIT_TASK(init_task_mem.s.task), .thread_info = INIT_THREAD_INFO(init_task_mem.s.task) }}; diff --git a/arch/m32r/kernel/init_task.c b/arch/m32r/kernel/init_task.c index fce57e5d3f91..6c42d5f8df50 100644 --- a/arch/m32r/kernel/init_task.c +++ b/arch/m32r/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 72bad65dba3a..41230c595a8e 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -42,9 +42,9 @@ */ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -union thread_union init_thread_union -__attribute__((section(".data.init_task"), aligned(THREAD_SIZE))) - = { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data + __attribute__((aligned(THREAD_SIZE))) = + { INIT_THREAD_INFO(init_task) }; /* initial task structure */ struct task_struct init_task = INIT_TASK(init_task); diff --git a/arch/m68knommu/kernel/init_task.c b/arch/m68knommu/kernel/init_task.c index 45e97a207fed..cbf9dc3cc51d 100644 --- a/arch/m68knommu/kernel/init_task.c +++ b/arch/m68knommu/kernel/init_task.c @@ -31,7 +31,6 @@ EXPORT_SYMBOL(init_task); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/arch/microblaze/kernel/init_task.c b/arch/microblaze/kernel/init_task.c index 67da22579b62..b5d711f94ff8 100644 --- a/arch/microblaze/kernel/init_task.c +++ b/arch/microblaze/kernel/init_task.c @@ -19,9 +19,8 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = -{ INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; struct task_struct init_task = INIT_TASK(init_task); EXPORT_SYMBOL(init_task); diff --git a/arch/mips/kernel/init_task.c b/arch/mips/kernel/init_task.c index 5b457a40c784..6d6ca5305895 100644 --- a/arch/mips/kernel/init_task.c +++ b/arch/mips/kernel/init_task.c @@ -21,9 +21,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * * The things we do for performance.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"), - __aligned__(THREAD_SIZE))) = +union thread_union init_thread_union __init_task_data + __attribute__((__aligned__(THREAD_SIZE))) = { INIT_THREAD_INFO(init_task) }; /* diff --git a/arch/mn10300/kernel/init_task.c b/arch/mn10300/kernel/init_task.c index 80d423b80af3..a481b043bea7 100644 --- a/arch/mn10300/kernel/init_task.c +++ b/arch/mn10300/kernel/init_task.c @@ -27,9 +27,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/parisc/kernel/init_task.c b/arch/parisc/kernel/init_task.c index 82974b20fc10..d020eae6525c 100644 --- a/arch/parisc/kernel/init_task.c +++ b/arch/parisc/kernel/init_task.c @@ -43,8 +43,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((aligned(128))) __attribute__((__section__(".data.init_task"))) = +union thread_union init_thread_union __init_task_data + __attribute__((aligned(128))) = { INIT_THREAD_INFO(init_task) }; #if PT_NLEVELS == 3 diff --git a/arch/powerpc/kernel/init_task.c b/arch/powerpc/kernel/init_task.c index ffc4253fef55..2375b7eb1c76 100644 --- a/arch/powerpc/kernel/init_task.c +++ b/arch/powerpc/kernel/init_task.c @@ -16,9 +16,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/powerpc/kernel/machine_kexec_64.c b/arch/powerpc/kernel/machine_kexec_64.c index 49e705fcee6d..040bd1de8d99 100644 --- a/arch/powerpc/kernel/machine_kexec_64.c +++ b/arch/powerpc/kernel/machine_kexec_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -249,8 +250,8 @@ static void kexec_prepare_cpus(void) * We could use a smaller stack if we don't care about anything using * current, but that audit has not been performed. */ -static union thread_union kexec_stack - __attribute__((__section__(".data.init_task"))) = { }; +static union thread_union kexec_stack __init_task_data = + { }; /* Our assembly helper, in kexec_stub.S */ extern NORET_TYPE void kexec_sequence(void *newstack, unsigned long start, diff --git a/arch/s390/kernel/init_task.c b/arch/s390/kernel/init_task.c index fe787f9e5f3f..4d1c9fb0b540 100644 --- a/arch/s390/kernel/init_task.c +++ b/arch/s390/kernel/init_task.c @@ -25,9 +25,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/score/kernel/init_task.c b/arch/score/kernel/init_task.c index ff952f6c63fd..baa03ee217d1 100644 --- a/arch/score/kernel/init_task.c +++ b/arch/score/kernel/init_task.c @@ -34,9 +34,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"), __aligned__(THREAD_SIZE))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/sh/kernel/init_task.c b/arch/sh/kernel/init_task.c index 1719957c0a69..11f2ea556a6b 100644 --- a/arch/sh/kernel/init_task.c +++ b/arch/sh/kernel/init_task.c @@ -17,9 +17,8 @@ struct pt_regs fake_swapper_regs; * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/sparc/kernel/init_task.c b/arch/sparc/kernel/init_task.c index 28125c5b3d3c..5fe3d65581f7 100644 --- a/arch/sparc/kernel/init_task.c +++ b/arch/sparc/kernel/init_task.c @@ -18,6 +18,5 @@ EXPORT_SYMBOL(init_task); * If this is not aligned on a 8k boundry, then you should change code * in etrap.S which assumes it. */ -union thread_union init_thread_union - __attribute__((section (".data.init_task"))) - = { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; diff --git a/arch/um/kernel/init_task.c b/arch/um/kernel/init_task.c index b25121b537d8..8aa77b61a5ff 100644 --- a/arch/um/kernel/init_task.c +++ b/arch/um/kernel/init_task.c @@ -30,9 +30,8 @@ EXPORT_SYMBOL(init_task); * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; union thread_union cpu0_irqstack __attribute__((__section__(".data.init_irqstack"))) = diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 270ff83efc11..3a54dcb9cd0e 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -20,9 +20,8 @@ static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); * way process stacks are handled. This is done by having a special * "init_task" linker map entry.. */ -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; /* * Initial task structure. diff --git a/arch/xtensa/kernel/init_task.c b/arch/xtensa/kernel/init_task.c index c4302f0e4ba0..cd122fb7e48a 100644 --- a/arch/xtensa/kernel/init_task.c +++ b/arch/xtensa/kernel/init_task.c @@ -23,9 +23,8 @@ static struct signal_struct init_signals = INIT_SIGNALS(init_signals); static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand); -union thread_union init_thread_union - __attribute__((__section__(".data.init_task"))) = -{ INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = + { INIT_THREAD_INFO(init_task) }; struct task_struct init_task = INIT_TASK(init_task); -- cgit v1.2.3 From cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Sep 2009 12:02:48 +0200 Subject: perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian Acked-by: Peter Zijlstra Acked-by: Paul Mackerras Reviewed-by: Arjan van de Ven Cc: Mike Galbraith Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: Benjamin Herrenschmidt Cc: David Howells Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: "David S. Miller" Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: LKML-Reference: Signed-off-by: Ingo Molnar --- arch/arm/include/asm/unistd.h | 2 +- arch/arm/kernel/calls.S | 2 +- arch/blackfin/include/asm/unistd.h | 2 +- arch/blackfin/mach-common/entry.S | 2 +- arch/frv/Kconfig | 2 +- arch/frv/include/asm/perf_counter.h | 17 - arch/frv/include/asm/perf_event.h | 17 + arch/frv/include/asm/unistd.h | 2 +- arch/frv/kernel/entry.S | 2 +- arch/frv/lib/Makefile | 2 +- arch/frv/lib/perf_counter.c | 19 - arch/frv/lib/perf_event.c | 19 + arch/m68k/include/asm/unistd.h | 2 +- arch/m68k/kernel/entry.S | 2 +- arch/m68knommu/kernel/syscalltable.S | 2 +- arch/microblaze/include/asm/unistd.h | 2 +- arch/microblaze/kernel/syscall_table.S | 2 +- arch/mips/include/asm/unistd.h | 6 +- arch/mips/kernel/scall32-o32.S | 2 +- arch/mips/kernel/scall64-64.S | 2 +- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/mn10300/include/asm/unistd.h | 2 +- arch/mn10300/kernel/entry.S | 2 +- arch/parisc/Kconfig | 2 +- arch/parisc/include/asm/perf_counter.h | 7 - arch/parisc/include/asm/perf_event.h | 7 + arch/parisc/include/asm/unistd.h | 4 +- arch/parisc/kernel/syscall_table.S | 2 +- arch/powerpc/Kconfig | 2 +- arch/powerpc/include/asm/hw_irq.h | 22 +- arch/powerpc/include/asm/paca.h | 2 +- arch/powerpc/include/asm/perf_counter.h | 110 - arch/powerpc/include/asm/perf_event.h | 110 + arch/powerpc/include/asm/systbl.h | 2 +- arch/powerpc/include/asm/unistd.h | 2 +- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/asm-offsets.c | 2 +- arch/powerpc/kernel/entry_64.S | 8 +- arch/powerpc/kernel/irq.c | 8 +- arch/powerpc/kernel/mpc7450-pmu.c | 2 +- arch/powerpc/kernel/perf_callchain.c | 2 +- arch/powerpc/kernel/perf_counter.c | 1315 -------- arch/powerpc/kernel/perf_event.c | 1315 ++++++++ arch/powerpc/kernel/power4-pmu.c | 2 +- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power5-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- arch/powerpc/kernel/power7-pmu.c | 2 +- arch/powerpc/kernel/ppc970-pmu.c | 2 +- arch/powerpc/kernel/time.c | 30 +- arch/powerpc/mm/fault.c | 8 +- arch/powerpc/platforms/Kconfig.cputype | 4 +- arch/s390/Kconfig | 2 +- arch/s390/include/asm/perf_counter.h | 10 - arch/s390/include/asm/perf_event.h | 10 + arch/s390/include/asm/unistd.h | 2 +- arch/s390/kernel/compat_wrapper.S | 8 +- arch/s390/kernel/syscalls.S | 2 +- arch/s390/mm/fault.c | 8 +- arch/sh/Kconfig | 2 +- arch/sh/include/asm/perf_counter.h | 9 - arch/sh/include/asm/perf_event.h | 9 + arch/sh/include/asm/unistd_32.h | 2 +- arch/sh/include/asm/unistd_64.h | 2 +- arch/sh/kernel/syscalls_32.S | 2 +- arch/sh/kernel/syscalls_64.S | 2 +- arch/sh/mm/fault_32.c | 8 +- arch/sh/mm/tlbflush_64.c | 8 +- arch/sparc/Kconfig | 4 +- arch/sparc/include/asm/perf_counter.h | 14 - arch/sparc/include/asm/perf_event.h | 14 + arch/sparc/include/asm/unistd.h | 2 +- arch/sparc/kernel/Makefile | 2 +- arch/sparc/kernel/nmi.c | 4 +- arch/sparc/kernel/pcr.c | 10 +- arch/sparc/kernel/perf_counter.c | 556 ---- arch/sparc/kernel/perf_event.c | 556 ++++ arch/sparc/kernel/systbls_32.S | 2 +- arch/sparc/kernel/systbls_64.S | 4 +- arch/x86/Kconfig | 2 +- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/entry_arch.h | 2 +- arch/x86/include/asm/perf_counter.h | 108 - arch/x86/include/asm/perf_event.h | 108 + arch/x86/include/asm/unistd_32.h | 2 +- arch/x86/include/asm/unistd_64.h | 4 +- arch/x86/kernel/apic/apic.c | 6 +- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 2298 -------------- arch/x86/kernel/cpu/perf_event.c | 2298 ++++++++++++++ arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- arch/x86/mm/fault.c | 8 +- arch/x86/oprofile/op_model_ppro.c | 4 +- arch/x86/oprofile/op_x86_model.h | 2 +- drivers/char/sysrq.c | 4 +- fs/exec.c | 6 +- include/asm-generic/unistd.h | 4 +- include/linux/init_task.h | 14 +- include/linux/perf_counter.h | 858 ------ include/linux/perf_event.h | 858 ++++++ include/linux/prctl.h | 4 +- include/linux/sched.h | 12 +- include/linux/syscalls.h | 6 +- include/trace/ftrace.h | 10 +- init/Kconfig | 8 +- kernel/Makefile | 2 +- kernel/exit.c | 8 +- kernel/fork.c | 8 +- kernel/perf_counter.c | 5000 ------------------------------- kernel/perf_event.c | 5000 +++++++++++++++++++++++++++++++ kernel/sched.c | 14 +- kernel/sys.c | 10 +- kernel/sys_ni.c | 2 +- kernel/sysctl.c | 22 +- kernel/timer.c | 4 +- kernel/trace/trace_syscalls.c | 6 +- mm/mmap.c | 6 +- mm/mprotect.c | 4 +- tools/perf/Makefile | 2 +- tools/perf/builtin-annotate.c | 28 +- tools/perf/builtin-record.c | 22 +- tools/perf/builtin-report.c | 48 +- tools/perf/builtin-sched.c | 20 +- tools/perf/builtin-stat.c | 10 +- tools/perf/builtin-timechart.c | 14 +- tools/perf/builtin-top.c | 12 +- tools/perf/builtin-trace.c | 22 +- tools/perf/design.txt | 58 +- tools/perf/perf.h | 12 +- tools/perf/util/event.h | 4 +- tools/perf/util/header.c | 6 +- tools/perf/util/header.h | 8 +- tools/perf/util/parse-events.c | 32 +- tools/perf/util/parse-events.h | 2 +- tools/perf/util/trace-event-info.c | 8 +- tools/perf/util/trace-event.h | 2 +- 141 files changed, 10694 insertions(+), 10694 deletions(-) delete mode 100644 arch/frv/include/asm/perf_counter.h create mode 100644 arch/frv/include/asm/perf_event.h delete mode 100644 arch/frv/lib/perf_counter.c create mode 100644 arch/frv/lib/perf_event.c delete mode 100644 arch/parisc/include/asm/perf_counter.h create mode 100644 arch/parisc/include/asm/perf_event.h delete mode 100644 arch/powerpc/include/asm/perf_counter.h create mode 100644 arch/powerpc/include/asm/perf_event.h delete mode 100644 arch/powerpc/kernel/perf_counter.c create mode 100644 arch/powerpc/kernel/perf_event.c delete mode 100644 arch/s390/include/asm/perf_counter.h create mode 100644 arch/s390/include/asm/perf_event.h delete mode 100644 arch/sh/include/asm/perf_counter.h create mode 100644 arch/sh/include/asm/perf_event.h delete mode 100644 arch/sparc/include/asm/perf_counter.h create mode 100644 arch/sparc/include/asm/perf_event.h delete mode 100644 arch/sparc/kernel/perf_counter.c create mode 100644 arch/sparc/kernel/perf_event.c delete mode 100644 arch/x86/include/asm/perf_counter.h create mode 100644 arch/x86/include/asm/perf_event.h delete mode 100644 arch/x86/kernel/cpu/perf_counter.c create mode 100644 arch/x86/kernel/cpu/perf_event.c delete mode 100644 include/linux/perf_counter.h create mode 100644 include/linux/perf_event.h delete mode 100644 kernel/perf_counter.c create mode 100644 kernel/perf_event.c (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index 9122c9ee18fb..89f7eade20af 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -390,7 +390,7 @@ #define __NR_preadv (__NR_SYSCALL_BASE+361) #define __NR_pwritev (__NR_SYSCALL_BASE+362) #define __NR_rt_tgsigqueueinfo (__NR_SYSCALL_BASE+363) -#define __NR_perf_counter_open (__NR_SYSCALL_BASE+364) +#define __NR_perf_event_open (__NR_SYSCALL_BASE+364) /* * The following SWIs are ARM private. diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index ecfa98954d1d..fafce1b5c69f 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -373,7 +373,7 @@ CALL(sys_preadv) CALL(sys_pwritev) CALL(sys_rt_tgsigqueueinfo) - CALL(sys_perf_counter_open) + CALL(sys_perf_event_open) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c8e7ee4768cd..02b1529dad57 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -381,7 +381,7 @@ #define __NR_preadv 366 #define __NR_pwritev 367 #define __NR_rt_tgsigqueueinfo 368 -#define __NR_perf_counter_open 369 +#define __NR_perf_event_open 369 #define __NR_syscall 370 #define NR_syscalls __NR_syscall diff --git a/arch/blackfin/mach-common/entry.S b/arch/blackfin/mach-common/entry.S index 01af24cde362..1e7cac23e25f 100644 --- a/arch/blackfin/mach-common/entry.S +++ b/arch/blackfin/mach-common/entry.S @@ -1620,7 +1620,7 @@ ENTRY(_sys_call_table) .long _sys_preadv .long _sys_pwritev .long _sys_rt_tgsigqueueinfo - .long _sys_perf_counter_open + .long _sys_perf_event_open .rept NR_syscalls-(.-_sys_call_table)/4 .long _sys_ni_syscall diff --git a/arch/frv/Kconfig b/arch/frv/Kconfig index b86e19c9b5b0..4b5830bcbe2e 100644 --- a/arch/frv/Kconfig +++ b/arch/frv/Kconfig @@ -7,7 +7,7 @@ config FRV default y select HAVE_IDE select HAVE_ARCH_TRACEHOOK - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config ZONE_DMA bool diff --git a/arch/frv/include/asm/perf_counter.h b/arch/frv/include/asm/perf_counter.h deleted file mode 100644 index ccf726e61b2e..000000000000 --- a/arch/frv/include/asm/perf_counter.h +++ /dev/null @@ -1,17 +0,0 @@ -/* FRV performance counter support - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#ifndef _ASM_PERF_COUNTER_H -#define _ASM_PERF_COUNTER_H - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#endif /* _ASM_PERF_COUNTER_H */ diff --git a/arch/frv/include/asm/perf_event.h b/arch/frv/include/asm/perf_event.h new file mode 100644 index 000000000000..a69e0155d146 --- /dev/null +++ b/arch/frv/include/asm/perf_event.h @@ -0,0 +1,17 @@ +/* FRV performance event support + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef _ASM_PERF_EVENT_H +#define _ASM_PERF_EVENT_H + +#define PERF_EVENT_INDEX_OFFSET 0 + +#endif /* _ASM_PERF_EVENT_H */ diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 4a8fb427ce0a..be6ef0f5cd42 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -342,7 +342,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/frv/kernel/entry.S b/arch/frv/kernel/entry.S index fde1e446b440..189397ec012a 100644 --- a/arch/frv/kernel/entry.S +++ b/arch/frv/kernel/entry.S @@ -1525,6 +1525,6 @@ sys_call_table: .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open syscall_table_size = (. - sys_call_table) diff --git a/arch/frv/lib/Makefile b/arch/frv/lib/Makefile index 0a377210c89b..f4709756d0d9 100644 --- a/arch/frv/lib/Makefile +++ b/arch/frv/lib/Makefile @@ -5,4 +5,4 @@ lib-y := \ __ashldi3.o __lshrdi3.o __muldi3.o __ashrdi3.o __negdi2.o __ucmpdi2.o \ checksum.o memcpy.o memset.o atomic-ops.o atomic64-ops.o \ - outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_counter.o + outsl_ns.o outsl_sw.o insl_ns.o insl_sw.o cache.o perf_event.o diff --git a/arch/frv/lib/perf_counter.c b/arch/frv/lib/perf_counter.c deleted file mode 100644 index 2000feecd571..000000000000 --- a/arch/frv/lib/perf_counter.c +++ /dev/null @@ -1,19 +0,0 @@ -/* Performance counter handling - * - * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. - */ - -#include - -/* - * mark the performance counter as pending - */ -void set_perf_counter_pending(void) -{ -} diff --git a/arch/frv/lib/perf_event.c b/arch/frv/lib/perf_event.c new file mode 100644 index 000000000000..9ac5acfd2e91 --- /dev/null +++ b/arch/frv/lib/perf_event.c @@ -0,0 +1,19 @@ +/* Performance event handling + * + * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#include + +/* + * mark the performance event as pending + */ +void set_perf_event_pending(void) +{ +} diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 946d8691f2b0..48b87f5ced50 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -335,7 +335,7 @@ #define __NR_preadv 329 #define __NR_pwritev 330 #define __NR_rt_tgsigqueueinfo 331 -#define __NR_perf_counter_open 332 +#define __NR_perf_event_open 332 #ifdef __KERNEL__ diff --git a/arch/m68k/kernel/entry.S b/arch/m68k/kernel/entry.S index 922f52e7ed1a..c5b33634c980 100644 --- a/arch/m68k/kernel/entry.S +++ b/arch/m68k/kernel/entry.S @@ -756,5 +756,5 @@ sys_call_table: .long sys_preadv .long sys_pwritev /* 330 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/m68knommu/kernel/syscalltable.S b/arch/m68knommu/kernel/syscalltable.S index 0ae123e08985..23535cc415ae 100644 --- a/arch/m68knommu/kernel/syscalltable.S +++ b/arch/m68knommu/kernel/syscalltable.S @@ -350,7 +350,7 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev /* 330 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open .rept NR_syscalls-(.-sys_call_table)/4 .long sys_ni_syscall diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index 0b852327c0e7..cb05a07e55e9 100644 --- a/arch/microblaze/include/asm/unistd.h +++ b/arch/microblaze/include/asm/unistd.h @@ -381,7 +381,7 @@ #define __NR_preadv 363 /* new */ #define __NR_pwritev 364 /* new */ #define __NR_rt_tgsigqueueinfo 365 /* new */ -#define __NR_perf_counter_open 366 /* new */ +#define __NR_perf_event_open 366 /* new */ #define __NR_syscalls 367 diff --git a/arch/microblaze/kernel/syscall_table.S b/arch/microblaze/kernel/syscall_table.S index 457216097dfd..ecec19155135 100644 --- a/arch/microblaze/kernel/syscall_table.S +++ b/arch/microblaze/kernel/syscall_table.S @@ -370,4 +370,4 @@ ENTRY(sys_call_table) .long sys_ni_syscall .long sys_ni_syscall .long sys_rt_tgsigqueueinfo /* 365 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index e753a777949b..8c9dfa9e9018 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -353,7 +353,7 @@ #define __NR_preadv (__NR_Linux + 330) #define __NR_pwritev (__NR_Linux + 331) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 332) -#define __NR_perf_counter_open (__NR_Linux + 333) +#define __NR_perf_event_open (__NR_Linux + 333) #define __NR_accept4 (__NR_Linux + 334) /* @@ -664,7 +664,7 @@ #define __NR_preadv (__NR_Linux + 289) #define __NR_pwritev (__NR_Linux + 290) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 291) -#define __NR_perf_counter_open (__NR_Linux + 292) +#define __NR_perf_event_open (__NR_Linux + 292) #define __NR_accept4 (__NR_Linux + 293) /* @@ -979,7 +979,7 @@ #define __NR_preadv (__NR_Linux + 293) #define __NR_pwritev (__NR_Linux + 294) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 295) -#define __NR_perf_counter_open (__NR_Linux + 296) +#define __NR_perf_event_open (__NR_Linux + 296) #define __NR_accept4 (__NR_Linux + 297) /* diff --git a/arch/mips/kernel/scall32-o32.S b/arch/mips/kernel/scall32-o32.S index 7c2de4f091c4..fd2a9bb620d6 100644 --- a/arch/mips/kernel/scall32-o32.S +++ b/arch/mips/kernel/scall32-o32.S @@ -581,7 +581,7 @@ einval: li v0, -ENOSYS sys sys_preadv 6 /* 4330 */ sys sys_pwritev 6 sys sys_rt_tgsigqueueinfo 4 - sys sys_perf_counter_open 5 + sys sys_perf_event_open 5 sys sys_accept4 4 .endm diff --git a/arch/mips/kernel/scall64-64.S b/arch/mips/kernel/scall64-64.S index b97b993846d6..18bf7f32c5e4 100644 --- a/arch/mips/kernel/scall64-64.S +++ b/arch/mips/kernel/scall64-64.S @@ -418,6 +418,6 @@ sys_call_table: PTR sys_preadv PTR sys_pwritev /* 5390 */ PTR sys_rt_tgsigqueueinfo - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sys_call_table,.-sys_call_table diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 1a6ae124635b..6ebc07976694 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -416,6 +416,6 @@ EXPORT(sysn32_call_table) PTR sys_preadv PTR sys_pwritev PTR compat_sys_rt_tgsigqueueinfo /* 5295 */ - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sysn32_call_table,.-sysn32_call_table diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index cd31087a651f..9bbf9775e0bd 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -536,6 +536,6 @@ sys_call_table: PTR compat_sys_preadv /* 4330 */ PTR compat_sys_pwritev PTR compat_sys_rt_tgsigqueueinfo - PTR sys_perf_counter_open + PTR sys_perf_event_open PTR sys_accept4 .size sys_call_table,.-sys_call_table diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index fad68616af32..2a983931c11f 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -347,7 +347,7 @@ #define __NR_preadv 334 #define __NR_pwritev 335 #define __NR_rt_tgsigqueueinfo 336 -#define __NR_perf_counter_open 337 +#define __NR_perf_event_open 337 #ifdef __KERNEL__ diff --git a/arch/mn10300/kernel/entry.S b/arch/mn10300/kernel/entry.S index e0d2563af4f2..a94e7ea3faa6 100644 --- a/arch/mn10300/kernel/entry.S +++ b/arch/mn10300/kernel/entry.S @@ -723,7 +723,7 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev /* 335 */ .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open nr_syscalls=(.-sys_call_table)/4 diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 06f8d5b5b0f9..f388dc68f605 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -16,7 +16,7 @@ config PARISC select RTC_DRV_GENERIC select INIT_ALL_POSSIBLE select BUG - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select GENERIC_ATOMIC64 if !64BIT help The PA-RISC microprocessor is designed by Hewlett-Packard and used diff --git a/arch/parisc/include/asm/perf_counter.h b/arch/parisc/include/asm/perf_counter.h deleted file mode 100644 index dc9e829f7013..000000000000 --- a/arch/parisc/include/asm/perf_counter.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef __ASM_PARISC_PERF_COUNTER_H -#define __ASM_PARISC_PERF_COUNTER_H - -/* parisc only supports software counters through this interface. */ -static inline void set_perf_counter_pending(void) { } - -#endif /* __ASM_PARISC_PERF_COUNTER_H */ diff --git a/arch/parisc/include/asm/perf_event.h b/arch/parisc/include/asm/perf_event.h new file mode 100644 index 000000000000..cc146427d8f9 --- /dev/null +++ b/arch/parisc/include/asm/perf_event.h @@ -0,0 +1,7 @@ +#ifndef __ASM_PARISC_PERF_EVENT_H +#define __ASM_PARISC_PERF_EVENT_H + +/* parisc only supports software events through this interface. */ +static inline void set_perf_event_pending(void) { } + +#endif /* __ASM_PARISC_PERF_EVENT_H */ diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index f3d3b8b012c4..cda158318c62 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -810,9 +810,9 @@ #define __NR_preadv (__NR_Linux + 315) #define __NR_pwritev (__NR_Linux + 316) #define __NR_rt_tgsigqueueinfo (__NR_Linux + 317) -#define __NR_perf_counter_open (__NR_Linux + 318) +#define __NR_perf_event_open (__NR_Linux + 318) -#define __NR_Linux_syscalls (__NR_perf_counter_open + 1) +#define __NR_Linux_syscalls (__NR_perf_event_open + 1) #define __IGNORE_select /* newselect */ diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index cf145eb026b3..843f423dec67 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -416,7 +416,7 @@ ENTRY_COMP(preadv) /* 315 */ ENTRY_COMP(pwritev) ENTRY_COMP(rt_tgsigqueueinfo) - ENTRY_SAME(perf_counter_open) + ENTRY_SAME(perf_event_open) /* Nothing yet */ diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 8250902265c6..4fd479059d65 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,7 +129,7 @@ config PPC select HAVE_OPROFILE select HAVE_SYSCALL_WRAPPERS if PPC64 select GENERIC_ATOMIC64 if PPC32 - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config EARLY_PRINTK bool diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index e73d554538dd..abbc2aaaced5 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -135,43 +135,43 @@ static inline int irqs_disabled_flags(unsigned long flags) */ struct irq_chip; -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS #ifdef CONFIG_PPC64 -static inline unsigned long test_perf_counter_pending(void) +static inline unsigned long test_perf_event_pending(void) { unsigned long x; asm volatile("lbz %0,%1(13)" : "=r" (x) - : "i" (offsetof(struct paca_struct, perf_counter_pending))); + : "i" (offsetof(struct paca_struct, perf_event_pending))); return x; } -static inline void set_perf_counter_pending(void) +static inline void set_perf_event_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (1), - "i" (offsetof(struct paca_struct, perf_counter_pending))); + "i" (offsetof(struct paca_struct, perf_event_pending))); } -static inline void clear_perf_counter_pending(void) +static inline void clear_perf_event_pending(void) { asm volatile("stb %0,%1(13)" : : "r" (0), - "i" (offsetof(struct paca_struct, perf_counter_pending))); + "i" (offsetof(struct paca_struct, perf_event_pending))); } #endif /* CONFIG_PPC64 */ -#else /* CONFIG_PERF_COUNTERS */ +#else /* CONFIG_PERF_EVENTS */ -static inline unsigned long test_perf_counter_pending(void) +static inline unsigned long test_perf_event_pending(void) { return 0; } -static inline void clear_perf_counter_pending(void) {} -#endif /* CONFIG_PERF_COUNTERS */ +static inline void clear_perf_event_pending(void) {} +#endif /* CONFIG_PERF_EVENTS */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index b634456ea893..154f405b642f 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -122,7 +122,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ - u8 perf_counter_pending; /* PM interrupt while soft-disabled */ + u8 perf_event_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h deleted file mode 100644 index 0ea0639fcf75..000000000000 --- a/arch/powerpc/include/asm/perf_counter.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Performance counter support - PowerPC-specific definitions. - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include - -#include - -#define MAX_HWCOUNTERS 8 -#define MAX_EVENT_ALTERNATIVES 8 -#define MAX_LIMITED_HWCOUNTERS 2 - -/* - * This struct provides the constants and functions needed to - * describe the PMU on a particular POWER-family CPU. - */ -struct power_pmu { - const char *name; - int n_counter; - int max_alternatives; - unsigned long add_fields; - unsigned long test_adder; - int (*compute_mmcr)(u64 events[], int n_ev, - unsigned int hwc[], unsigned long mmcr[]); - int (*get_constraint)(u64 event, unsigned long *mskp, - unsigned long *valp); - int (*get_alternatives)(u64 event, unsigned int flags, - u64 alt[]); - void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); - int (*limited_pmc_event)(u64 event); - u32 flags; - int n_generic; - int *generic_events; - int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; -}; - -/* - * Values for power_pmu.flags - */ -#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ -#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ - -/* - * Values for flags to get_alternatives() - */ -#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ -#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ -#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ - -extern int register_power_pmu(struct power_pmu *); - -struct pt_regs; -extern unsigned long perf_misc_flags(struct pt_regs *regs); -extern unsigned long perf_instruction_pointer(struct pt_regs *regs); - -#define PERF_COUNTER_INDEX_OFFSET 1 - -/* - * Only override the default definitions in include/linux/perf_counter.h - * if we have hardware PMU support. - */ -#ifdef CONFIG_PPC_PERF_CTRS -#define perf_misc_flags(regs) perf_misc_flags(regs) -#endif - -/* - * The power_pmu.get_constraint function returns a 32/64-bit value and - * a 32/64-bit mask that express the constraints between this event and - * other events. - * - * The value and mask are divided up into (non-overlapping) bitfields - * of three different types: - * - * Select field: this expresses the constraint that some set of bits - * in MMCR* needs to be set to a specific value for this event. For a - * select field, the mask contains 1s in every bit of the field, and - * the value contains a unique value for each possible setting of the - * MMCR* bits. The constraint checking code will ensure that two events - * that set the same field in their masks have the same value in their - * value dwords. - * - * Add field: this expresses the constraint that there can be at most - * N events in a particular class. A field of k bits can be used for - * N <= 2^(k-1) - 1. The mask has the most significant bit of the field - * set (and the other bits 0), and the value has only the least significant - * bit of the field set. In addition, the 'add_fields' and 'test_adder' - * in the struct power_pmu for this processor come into play. The - * add_fields value contains 1 in the LSB of the field, and the - * test_adder contains 2^(k-1) - 1 - N in the field. - * - * NAND field: this expresses the constraint that you may not have events - * in all of a set of classes. (For example, on PPC970, you can't select - * events from the FPU, ISU and IDU simultaneously, although any two are - * possible.) For N classes, the field is N+1 bits wide, and each class - * is assigned one bit from the least-significant N bits. The mask has - * only the most-significant bit set, and the value has only the bit - * for the event's class set. The test_adder has the least significant - * bit set in the field. - * - * If an event is not subject to the constraint expressed by a particular - * field, then it will have 0 in both the mask and value for that field. - */ diff --git a/arch/powerpc/include/asm/perf_event.h b/arch/powerpc/include/asm/perf_event.h new file mode 100644 index 000000000000..2499aaadaeb9 --- /dev/null +++ b/arch/powerpc/include/asm/perf_event.h @@ -0,0 +1,110 @@ +/* + * Performance event support - PowerPC-specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include + +#include + +#define MAX_HWEVENTS 8 +#define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWEVENTS 2 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. + */ +struct power_pmu { + const char *name; + int n_event; + int max_alternatives; + unsigned long add_fields; + unsigned long test_adder; + int (*compute_mmcr)(u64 events[], int n_ev, + unsigned int hwc[], unsigned long mmcr[]); + int (*get_constraint)(u64 event_id, unsigned long *mskp, + unsigned long *valp); + int (*get_alternatives)(u64 event_id, unsigned int flags, + u64 alt[]); + void (*disable_pmc)(unsigned int pmc, unsigned long mmcr[]); + int (*limited_pmc_event)(u64 event_id); + u32 flags; + int n_generic; + int *generic_events; + int (*cache_events)[PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ + +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + +extern int register_power_pmu(struct power_pmu *); + +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +#define PERF_EVENT_INDEX_OFFSET 1 + +/* + * Only override the default definitions in include/linux/perf_event.h + * if we have hardware PMU support. + */ +#ifdef CONFIG_PPC_PERF_CTRS +#define perf_misc_flags(regs) perf_misc_flags(regs) +#endif + +/* + * The power_pmu.get_constraint function returns a 32/64-bit value and + * a 32/64-bit mask that express the constraints between this event_id and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event_id. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event_id's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event_id is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index ed24bd92fe49..c7d671a7d9a1 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,7 +322,7 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL_SPU(perf_counter_open) +SYSCALL_SPU(perf_event_open) COMPAT_SYS_SPU(preadv) COMPAT_SYS_SPU(pwritev) COMPAT_SYS(rt_tgsigqueueinfo) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index cef080bfc607..f6ca76176766 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -341,7 +341,7 @@ #define __NR_dup3 316 #define __NR_pipe2 317 #define __NR_inotify_init1 318 -#define __NR_perf_counter_open 319 +#define __NR_perf_event_open 319 #define __NR_preadv 320 #define __NR_pwritev 321 #define __NR_rt_tgsigqueueinfo 322 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 569f79ccd310..b23664a0b86c 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -97,7 +97,7 @@ obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o -obj-$(CONFIG_PPC_PERF_CTRS) += perf_counter.o perf_callchain.o +obj-$(CONFIG_PPC_PERF_CTRS) += perf_event.o perf_callchain.o obj64-$(CONFIG_PPC_PERF_CTRS) += power4-pmu.o ppc970-pmu.o power5-pmu.o \ power5+-pmu.o power6-pmu.o power7-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index f0df285f0f87..0812b0f414bb 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -133,7 +133,7 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); - DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); + DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_event_pending)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); #ifdef CONFIG_PPC_MM_SLICES DEFINE(PACALOWSLICESPSIZE, offsetof(struct paca_struct, diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 66bcda34a6bb..900e0eea0099 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -556,14 +556,14 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) 2: TRACE_AND_RESTORE_IRQ(r5); -#ifdef CONFIG_PERF_COUNTERS - /* check paca->perf_counter_pending if we're enabling ints */ +#ifdef CONFIG_PERF_EVENTS + /* check paca->perf_event_pending if we're enabling ints */ lbz r3,PACAPERFPEND(r13) and. r3,r3,r5 beq 27f - bl .perf_counter_do_pending + bl .perf_event_do_pending 27: -#endif /* CONFIG_PERF_COUNTERS */ +#endif /* CONFIG_PERF_EVENTS */ /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index f7f376ea7b17..e5d121177984 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -138,9 +138,9 @@ notrace void raw_local_irq_restore(unsigned long en) } #endif /* CONFIG_PPC_STD_MMU_64 */ - if (test_perf_counter_pending()) { - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (test_perf_event_pending()) { + clear_perf_event_pending(); + perf_event_do_pending(); } /* diff --git a/arch/powerpc/kernel/mpc7450-pmu.c b/arch/powerpc/kernel/mpc7450-pmu.c index cc466d039af6..09d72028f317 100644 --- a/arch/powerpc/kernel/mpc7450-pmu.c +++ b/arch/powerpc/kernel/mpc7450-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/perf_callchain.c b/arch/powerpc/kernel/perf_callchain.c index f74b62c67511..0a03cf70d247 100644 --- a/arch/powerpc/kernel/perf_callchain.c +++ b/arch/powerpc/kernel/perf_callchain.c @@ -10,7 +10,7 @@ */ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c deleted file mode 100644 index 5ccf9bca96c0..000000000000 --- a/arch/powerpc/kernel/perf_counter.c +++ /dev/null @@ -1,1315 +0,0 @@ -/* - * Performance counter support - powerpc architecture code - * - * Copyright 2008-2009 Paul Mackerras, IBM Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -struct cpu_hw_counters { - int n_counters; - int n_percpu; - int disabled; - int n_added; - int n_limited; - u8 pmcs_enabled; - struct perf_counter *counter[MAX_HWCOUNTERS]; - u64 events[MAX_HWCOUNTERS]; - unsigned int flags[MAX_HWCOUNTERS]; - unsigned long mmcr[3]; - struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; - u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; - u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; - unsigned long amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; - unsigned long avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; -}; -DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); - -struct power_pmu *ppmu; - -/* - * Normally, to ignore kernel events we set the FCS (freeze counters - * in supervisor mode) bit in MMCR0, but if the kernel runs with the - * hypervisor bit set in the MSR, or if we are running on a processor - * where the hypervisor bit is forced to 1 (as on Apple G5 processors), - * then we need to use the FCHV bit to ignore kernel events. - */ -static unsigned int freeze_counters_kernel = MMCR0_FCS; - -/* - * 32-bit doesn't have MMCRA but does have an MMCR2, - * and a few other names are different. - */ -#ifdef CONFIG_PPC32 - -#define MMCR0_FCHV 0 -#define MMCR0_PMCjCE MMCR0_PMCnCE - -#define SPRN_MMCRA SPRN_MMCR2 -#define MMCRA_SAMPLE_ENABLE 0 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - return 0; -} -static inline void perf_read_regs(struct pt_regs *regs) { } -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return 0; -} - -#endif /* CONFIG_PPC32 */ - -/* - * Things that are specific to 64-bit implementations. - */ -#ifdef CONFIG_PPC64 - -static inline unsigned long perf_ip_adjust(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { - unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; - if (slot > 1) - return 4 * (slot - 1); - } - return 0; -} - -/* - * The user wants a data address recorded. - * If we're not doing instruction sampling, give them the SDAR - * (sampled data address). If we are doing instruction sampling, then - * only give them the SDAR if it corresponds to the instruction - * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC - * bit in MMCRA. - */ -static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) -{ - unsigned long mmcra = regs->dsisr; - unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? - POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; - - if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) - *addrp = mfspr(SPRN_SDAR); -} - -static inline u32 perf_get_misc_flags(struct pt_regs *regs) -{ - unsigned long mmcra = regs->dsisr; - - if (TRAP(regs) != 0xf00) - return 0; /* not a PMU interrupt */ - - if (ppmu->flags & PPMU_ALT_SIPR) { - if (mmcra & POWER6_MMCRA_SIHV) - return PERF_EVENT_MISC_HYPERVISOR; - return (mmcra & POWER6_MMCRA_SIPR) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; - } - if (mmcra & MMCRA_SIHV) - return PERF_EVENT_MISC_HYPERVISOR; - return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER : - PERF_EVENT_MISC_KERNEL; -} - -/* - * Overload regs->dsisr to store MMCRA so we only need to read it once - * on each interrupt. - */ -static inline void perf_read_regs(struct pt_regs *regs) -{ - regs->dsisr = mfspr(SPRN_MMCRA); -} - -/* - * If interrupts were soft-disabled when a PMU interrupt occurs, treat - * it as an NMI. - */ -static inline int perf_intr_is_nmi(struct pt_regs *regs) -{ - return !regs->softe; -} - -#endif /* CONFIG_PPC64 */ - -static void perf_counter_interrupt(struct pt_regs *regs); - -void perf_counter_print_debug(void) -{ -} - -/* - * Read one performance monitor counter (PMC). - */ -static unsigned long read_pmc(int idx) -{ - unsigned long val; - - switch (idx) { - case 1: - val = mfspr(SPRN_PMC1); - break; - case 2: - val = mfspr(SPRN_PMC2); - break; - case 3: - val = mfspr(SPRN_PMC3); - break; - case 4: - val = mfspr(SPRN_PMC4); - break; - case 5: - val = mfspr(SPRN_PMC5); - break; - case 6: - val = mfspr(SPRN_PMC6); - break; -#ifdef CONFIG_PPC64 - case 7: - val = mfspr(SPRN_PMC7); - break; - case 8: - val = mfspr(SPRN_PMC8); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to read PMC%d\n", idx); - val = 0; - } - return val; -} - -/* - * Write one PMC. - */ -static void write_pmc(int idx, unsigned long val) -{ - switch (idx) { - case 1: - mtspr(SPRN_PMC1, val); - break; - case 2: - mtspr(SPRN_PMC2, val); - break; - case 3: - mtspr(SPRN_PMC3, val); - break; - case 4: - mtspr(SPRN_PMC4, val); - break; - case 5: - mtspr(SPRN_PMC5, val); - break; - case 6: - mtspr(SPRN_PMC6, val); - break; -#ifdef CONFIG_PPC64 - case 7: - mtspr(SPRN_PMC7, val); - break; - case 8: - mtspr(SPRN_PMC8, val); - break; -#endif /* CONFIG_PPC64 */ - default: - printk(KERN_ERR "oops trying to write PMC%d\n", idx); - } -} - -/* - * Check if a set of events can all go on the PMU at once. - * If they can't, this will look at alternative codes for the events - * and see if any combination of alternative codes is feasible. - * The feasible set is returned in event[]. - */ -static int power_check_constraints(struct cpu_hw_counters *cpuhw, - u64 event[], unsigned int cflags[], - int n_ev) -{ - unsigned long mask, value, nv; - unsigned long smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; - int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; - int i, j; - unsigned long addf = ppmu->add_fields; - unsigned long tadd = ppmu->test_adder; - - if (n_ev > ppmu->n_counter) - return -1; - - /* First see if the events will go on as-is */ - for (i = 0; i < n_ev; ++i) { - if ((cflags[i] & PPMU_LIMITED_PMC_REQD) - && !ppmu->limited_pmc_event(event[i])) { - ppmu->get_alternatives(event[i], cflags[i], - cpuhw->alternatives[i]); - event[i] = cpuhw->alternatives[i][0]; - } - if (ppmu->get_constraint(event[i], &cpuhw->amasks[i][0], - &cpuhw->avalues[i][0])) - return -1; - } - value = mask = 0; - for (i = 0; i < n_ev; ++i) { - nv = (value | cpuhw->avalues[i][0]) + - (value & cpuhw->avalues[i][0] & addf); - if ((((nv + tadd) ^ value) & mask) != 0 || - (((nv + tadd) ^ cpuhw->avalues[i][0]) & - cpuhw->amasks[i][0]) != 0) - break; - value = nv; - mask |= cpuhw->amasks[i][0]; - } - if (i == n_ev) - return 0; /* all OK */ - - /* doesn't work, gather alternatives... */ - if (!ppmu->get_alternatives) - return -1; - for (i = 0; i < n_ev; ++i) { - choice[i] = 0; - n_alt[i] = ppmu->get_alternatives(event[i], cflags[i], - cpuhw->alternatives[i]); - for (j = 1; j < n_alt[i]; ++j) - ppmu->get_constraint(cpuhw->alternatives[i][j], - &cpuhw->amasks[i][j], - &cpuhw->avalues[i][j]); - } - - /* enumerate all possibilities and see if any will work */ - i = 0; - j = -1; - value = mask = nv = 0; - while (i < n_ev) { - if (j >= 0) { - /* we're backtracking, restore context */ - value = svalues[i]; - mask = smasks[i]; - j = choice[i]; - } - /* - * See if any alternative k for event i, - * where k > j, will satisfy the constraints. - */ - while (++j < n_alt[i]) { - nv = (value | cpuhw->avalues[i][j]) + - (value & cpuhw->avalues[i][j] & addf); - if ((((nv + tadd) ^ value) & mask) == 0 && - (((nv + tadd) ^ cpuhw->avalues[i][j]) - & cpuhw->amasks[i][j]) == 0) - break; - } - if (j >= n_alt[i]) { - /* - * No feasible alternative, backtrack - * to event i-1 and continue enumerating its - * alternatives from where we got up to. - */ - if (--i < 0) - return -1; - } else { - /* - * Found a feasible alternative for event i, - * remember where we got up to with this event, - * go on to the next event, and start with - * the first alternative for it. - */ - choice[i] = j; - svalues[i] = value; - smasks[i] = mask; - value = nv; - mask |= cpuhw->amasks[i][j]; - ++i; - j = -1; - } - } - - /* OK, we have a feasible combination, tell the caller the solution */ - for (i = 0; i < n_ev; ++i) - event[i] = cpuhw->alternatives[i][choice[i]]; - return 0; -} - -/* - * Check if newly-added counters have consistent settings for - * exclude_{user,kernel,hv} with each other and any previously - * added counters. - */ -static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], - int n_prev, int n_new) -{ - int eu = 0, ek = 0, eh = 0; - int i, n, first; - struct perf_counter *counter; - - n = n_prev + n_new; - if (n <= 1) - return 0; - - first = 1; - for (i = 0; i < n; ++i) { - if (cflags[i] & PPMU_LIMITED_PMC_OK) { - cflags[i] &= ~PPMU_LIMITED_PMC_REQD; - continue; - } - counter = ctrs[i]; - if (first) { - eu = counter->attr.exclude_user; - ek = counter->attr.exclude_kernel; - eh = counter->attr.exclude_hv; - first = 0; - } else if (counter->attr.exclude_user != eu || - counter->attr.exclude_kernel != ek || - counter->attr.exclude_hv != eh) { - return -EAGAIN; - } - } - - if (eu || ek || eh) - for (i = 0; i < n; ++i) - if (cflags[i] & PPMU_LIMITED_PMC_OK) - cflags[i] |= PPMU_LIMITED_PMC_REQD; - - return 0; -} - -static void power_pmu_read(struct perf_counter *counter) -{ - s64 val, delta, prev; - - if (!counter->hw.idx) - return; - /* - * Performance monitor interrupts come even when interrupts - * are soft-disabled, as long as interrupts are hard-enabled. - * Therefore we treat them like NMIs. - */ - do { - prev = atomic64_read(&counter->hw.prev_count); - barrier(); - val = read_pmc(counter->hw.idx); - } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); - - /* The counters are only 32 bits wide */ - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &counter->hw.period_left); -} - -/* - * On some machines, PMC5 and PMC6 can't be written, don't respect - * the freeze conditions, and don't generate interrupts. This tells - * us if `counter' is using such a PMC. - */ -static int is_limited_pmc(int pmcnum) -{ - return (ppmu->flags & PPMU_LIMITED_PMC5_6) - && (pmcnum == 5 || pmcnum == 6); -} - -static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_counter *counter; - u64 val, prev, delta; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - counter = cpuhw->limited_counter[i]; - if (!counter->hw.idx) - continue; - val = (counter->hw.idx == 5) ? pmc5 : pmc6; - prev = atomic64_read(&counter->hw.prev_count); - counter->hw.idx = 0; - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - } -} - -static void thaw_limited_counters(struct cpu_hw_counters *cpuhw, - unsigned long pmc5, unsigned long pmc6) -{ - struct perf_counter *counter; - u64 val; - int i; - - for (i = 0; i < cpuhw->n_limited; ++i) { - counter = cpuhw->limited_counter[i]; - counter->hw.idx = cpuhw->limited_hwidx[i]; - val = (counter->hw.idx == 5) ? pmc5 : pmc6; - atomic64_set(&counter->hw.prev_count, val); - perf_counter_update_userpage(counter); - } -} - -/* - * Since limited counters don't respect the freeze conditions, we - * have to read them immediately after freezing or unfreezing the - * other counters. We try to keep the values from the limited - * counters as consistent as possible by keeping the delay (in - * cycles and instructions) between freezing/unfreezing and reading - * the limited counters as small and consistent as possible. - * Therefore, if any limited counters are in use, we read them - * both, and always in the same order, to minimize variability, - * and do it inside the same asm that writes MMCR0. - */ -static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) -{ - unsigned long pmc5, pmc6; - - if (!cpuhw->n_limited) { - mtspr(SPRN_MMCR0, mmcr0); - return; - } - - /* - * Write MMCR0, then read PMC5 and PMC6 immediately. - * To ensure we don't get a performance monitor interrupt - * between writing MMCR0 and freezing/thawing the limited - * counters, we first write MMCR0 with the counter overflow - * interrupt enable bits turned off. - */ - asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" - : "=&r" (pmc5), "=&r" (pmc6) - : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), - "i" (SPRN_MMCR0), - "i" (SPRN_PMC5), "i" (SPRN_PMC6)); - - if (mmcr0 & MMCR0_FC) - freeze_limited_counters(cpuhw, pmc5, pmc6); - else - thaw_limited_counters(cpuhw, pmc5, pmc6); - - /* - * Write the full MMCR0 including the counter overflow interrupt - * enable bits, if necessary. - */ - if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) - mtspr(SPRN_MMCR0, mmcr0); -} - -/* - * Disable all counters to prevent PMU interrupts and to allow - * counters to be added or removed. - */ -void hw_perf_disable(void) -{ - struct cpu_hw_counters *cpuhw; - unsigned long flags; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_counters); - - if (!cpuhw->disabled) { - cpuhw->disabled = 1; - cpuhw->n_added = 0; - - /* - * Check if we ever enabled the PMU on this cpu. - */ - if (!cpuhw->pmcs_enabled) { - ppc_enable_pmcs(); - cpuhw->pmcs_enabled = 1; - } - - /* - * Disable instruction sampling if it was enabled - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mtspr(SPRN_MMCRA, - cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mb(); - } - - /* - * Set the 'freeze counters' bit. - * The barrier is to make sure the mtspr has been - * executed and the PMU has frozen the counters - * before we return. - */ - write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); - mb(); - } - local_irq_restore(flags); -} - -/* - * Re-enable all counters if disable == 0. - * If we were previously disabled and counters were added, then - * put the new config on the PMU. - */ -void hw_perf_enable(void) -{ - struct perf_counter *counter; - struct cpu_hw_counters *cpuhw; - unsigned long flags; - long i; - unsigned long val; - s64 left; - unsigned int hwc_index[MAX_HWCOUNTERS]; - int n_lim; - int idx; - - if (!ppmu) - return; - local_irq_save(flags); - cpuhw = &__get_cpu_var(cpu_hw_counters); - if (!cpuhw->disabled) { - local_irq_restore(flags); - return; - } - cpuhw->disabled = 0; - - /* - * If we didn't change anything, or only removed counters, - * no need to recalculate MMCR* settings and reset the PMCs. - * Just reenable the PMU with the current MMCR* settings - * (possibly updated for removal of counters). - */ - if (!cpuhw->n_added) { - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - if (cpuhw->n_counters == 0) - ppc_set_pmu_inuse(0); - goto out_enable; - } - - /* - * Compute MMCR* values for the new set of counters - */ - if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, - cpuhw->mmcr)) { - /* shouldn't ever get here */ - printk(KERN_ERR "oops compute_mmcr failed\n"); - goto out; - } - - /* - * Add in MMCR0 freeze bits corresponding to the - * attr.exclude_* bits for the first counter. - * We have already checked that all counters have the - * same values for these bits as the first counter. - */ - counter = cpuhw->counter[0]; - if (counter->attr.exclude_user) - cpuhw->mmcr[0] |= MMCR0_FCP; - if (counter->attr.exclude_kernel) - cpuhw->mmcr[0] |= freeze_counters_kernel; - if (counter->attr.exclude_hv) - cpuhw->mmcr[0] |= MMCR0_FCHV; - - /* - * Write the new configuration to MMCR* with the freeze - * bit set and set the hardware counters to their initial values. - * Then unfreeze the counters. - */ - ppc_set_pmu_inuse(1); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); - mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) - | MMCR0_FC); - - /* - * Read off any pre-existing counters that need to move - * to another PMC. - */ - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_pmu_read(counter); - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; - } - } - - /* - * Initialize the PMCs for all the new and moved counters. - */ - cpuhw->n_limited = n_lim = 0; - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter->hw.idx) - continue; - idx = hwc_index[i] + 1; - if (is_limited_pmc(idx)) { - cpuhw->limited_counter[n_lim] = counter; - cpuhw->limited_hwidx[n_lim] = idx; - ++n_lim; - continue; - } - val = 0; - if (counter->hw.sample_period) { - left = atomic64_read(&counter->hw.period_left); - if (left < 0x80000000L) - val = 0x80000000L - left; - } - atomic64_set(&counter->hw.prev_count, val); - counter->hw.idx = idx; - write_pmc(idx, val); - perf_counter_update_userpage(counter); - } - cpuhw->n_limited = n_lim; - cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; - - out_enable: - mb(); - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - /* - * Enable instruction sampling if necessary - */ - if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { - mb(); - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); - } - - out: - local_irq_restore(flags); -} - -static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], u64 *events, - unsigned int *flags) -{ - int n = 0; - struct perf_counter *counter; - - if (!is_software_counter(group)) { - if (n >= max_count) - return -1; - ctrs[n] = group; - flags[n] = group->hw.counter_base; - events[n++] = group->hw.config; - } - list_for_each_entry(counter, &group->sibling_list, list_entry) { - if (!is_software_counter(counter) && - counter->state != PERF_COUNTER_STATE_OFF) { - if (n >= max_count) - return -1; - ctrs[n] = counter; - flags[n] = counter->hw.counter_base; - events[n++] = counter->hw.config; - } - } - return n; -} - -static void counter_sched_in(struct perf_counter *counter, int cpu) -{ - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; - counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; - if (is_software_counter(counter)) - counter->pmu->enable(counter); -} - -/* - * Called to enable a whole group of counters. - * Returns 1 if the group was enabled, or -EAGAIN if it could not be. - * Assumes the caller has disabled interrupts and has - * frozen the PMU with hw_perf_save_disable. - */ -int hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - struct cpu_hw_counters *cpuhw; - long i, n, n0; - struct perf_counter *sub; - - if (!ppmu) - return 0; - cpuhw = &__get_cpu_var(cpu_hw_counters); - n0 = cpuhw->n_counters; - n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->counter[n0], &cpuhw->events[n0], - &cpuhw->flags[n0]); - if (n < 0) - return -EAGAIN; - if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n)) - return -EAGAIN; - i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); - if (i < 0) - return -EAGAIN; - cpuhw->n_counters = n0 + n; - cpuhw->n_added += n; - - /* - * OK, this group can go on; update counter states etc., - * and enable any software counters - */ - for (i = n0; i < n0 + n; ++i) - cpuhw->counter[i]->hw.config = cpuhw->events[i]; - cpuctx->active_oncpu += n; - n = 1; - counter_sched_in(group_leader, cpu); - list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { - if (sub->state != PERF_COUNTER_STATE_OFF) { - counter_sched_in(sub, cpu); - ++n; - } - } - ctx->nr_active += n; - - return 1; -} - -/* - * Add a counter to the PMU. - * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_enable to do the - * actual work of reconfiguring the PMU. - */ -static int power_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuhw; - unsigned long flags; - int n0; - int ret = -EAGAIN; - - local_irq_save(flags); - perf_disable(); - - /* - * Add the counter to the list (if there is room) - * and check whether the total set is still feasible. - */ - cpuhw = &__get_cpu_var(cpu_hw_counters); - n0 = cpuhw->n_counters; - if (n0 >= ppmu->n_counter) - goto out; - cpuhw->counter[n0] = counter; - cpuhw->events[n0] = counter->hw.config; - cpuhw->flags[n0] = counter->hw.counter_base; - if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1)) - goto out; - if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) - goto out; - - counter->hw.config = cpuhw->events[n0]; - ++cpuhw->n_counters; - ++cpuhw->n_added; - - ret = 0; - out: - perf_enable(); - local_irq_restore(flags); - return ret; -} - -/* - * Remove a counter from the PMU. - */ -static void power_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuhw; - long i; - unsigned long flags; - - local_irq_save(flags); - perf_disable(); - - power_pmu_read(counter); - - cpuhw = &__get_cpu_var(cpu_hw_counters); - for (i = 0; i < cpuhw->n_counters; ++i) { - if (counter == cpuhw->counter[i]) { - while (++i < cpuhw->n_counters) - cpuhw->counter[i-1] = cpuhw->counter[i]; - --cpuhw->n_counters; - ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); - if (counter->hw.idx) { - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; - } - perf_counter_update_userpage(counter); - break; - } - } - for (i = 0; i < cpuhw->n_limited; ++i) - if (counter == cpuhw->limited_counter[i]) - break; - if (i < cpuhw->n_limited) { - while (++i < cpuhw->n_limited) { - cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; - cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; - } - --cpuhw->n_limited; - } - if (cpuhw->n_counters == 0) { - /* disable exceptions if no counters are running */ - cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); - } - - perf_enable(); - local_irq_restore(flags); -} - -/* - * Re-enable interrupts on a counter after they were throttled - * because they were coming too fast. - */ -static void power_pmu_unthrottle(struct perf_counter *counter) -{ - s64 val, left; - unsigned long flags; - - if (!counter->hw.idx || !counter->hw.sample_period) - return; - local_irq_save(flags); - perf_disable(); - power_pmu_read(counter); - left = counter->hw.sample_period; - counter->hw.last_period = left; - val = 0; - if (left < 0x80000000L) - val = 0x80000000L - left; - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); - perf_enable(); - local_irq_restore(flags); -} - -struct pmu power_pmu = { - .enable = power_pmu_enable, - .disable = power_pmu_disable, - .read = power_pmu_read, - .unthrottle = power_pmu_unthrottle, -}; - -/* - * Return 1 if we might be able to put counter on a limited PMC, - * or 0 if not. - * A counter can only go on a limited PMC if it counts something - * that a limited PMC can count, doesn't require interrupts, and - * doesn't exclude any processor mode. - */ -static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, - unsigned int flags) -{ - int n; - u64 alt[MAX_EVENT_ALTERNATIVES]; - - if (counter->attr.exclude_user - || counter->attr.exclude_kernel - || counter->attr.exclude_hv - || counter->attr.sample_period) - return 0; - - if (ppmu->limited_pmc_event(ev)) - return 1; - - /* - * The requested event isn't on a limited PMC already; - * see if any alternative code goes on a limited PMC. - */ - if (!ppmu->get_alternatives) - return 0; - - flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; - n = ppmu->get_alternatives(ev, flags, alt); - - return n > 0; -} - -/* - * Find an alternative event that goes on a normal PMC, if possible, - * and return the event code, or 0 if there is no such alternative. - * (Note: event code 0 is "don't count" on all machines.) - */ -static u64 normal_pmc_alternative(u64 ev, unsigned long flags) -{ - u64 alt[MAX_EVENT_ALTERNATIVES]; - int n; - - flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); - n = ppmu->get_alternatives(ev, flags, alt); - if (!n) - return 0; - return alt[0]; -} - -/* Number of perf_counters counting hardware events */ -static atomic_t num_counters; -/* Used to avoid races in calling reserve/release_pmc_hardware */ -static DEFINE_MUTEX(pmc_reserve_mutex); - -/* - * Release the PMU if this is the last perf_counter. - */ -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - if (!atomic_add_unless(&num_counters, -1, 1)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_dec_return(&num_counters) == 0) - release_pmc_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -/* - * Translate a generic cache event config to a raw event code. - */ -static int hw_perf_cache_event(u64 config, u64 *eventp) -{ - unsigned long type, op, result; - int ev; - - if (!ppmu->cache_events) - return -EINVAL; - - /* unpack config */ - type = config & 0xff; - op = (config >> 8) & 0xff; - result = (config >> 16) & 0xff; - - if (type >= PERF_COUNT_HW_CACHE_MAX || - op >= PERF_COUNT_HW_CACHE_OP_MAX || - result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - ev = (*ppmu->cache_events)[type][op][result]; - if (ev == 0) - return -EOPNOTSUPP; - if (ev == -1) - return -EINVAL; - *eventp = ev; - return 0; -} - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - u64 ev; - unsigned long flags; - struct perf_counter *ctrs[MAX_HWCOUNTERS]; - u64 events[MAX_HWCOUNTERS]; - unsigned int cflags[MAX_HWCOUNTERS]; - int n; - int err; - struct cpu_hw_counters *cpuhw; - - if (!ppmu) - return ERR_PTR(-ENXIO); - switch (counter->attr.type) { - case PERF_TYPE_HARDWARE: - ev = counter->attr.config; - if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return ERR_PTR(-EOPNOTSUPP); - ev = ppmu->generic_events[ev]; - break; - case PERF_TYPE_HW_CACHE: - err = hw_perf_cache_event(counter->attr.config, &ev); - if (err) - return ERR_PTR(err); - break; - case PERF_TYPE_RAW: - ev = counter->attr.config; - break; - default: - return ERR_PTR(-EINVAL); - } - counter->hw.config_base = ev; - counter->hw.idx = 0; - - /* - * If we are not running on a hypervisor, force the - * exclude_hv bit to 0 so that we don't care what - * the user set it to. - */ - if (!firmware_has_feature(FW_FEATURE_LPAR)) - counter->attr.exclude_hv = 0; - - /* - * If this is a per-task counter, then we can use - * PM_RUN_* events interchangeably with their non RUN_* - * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. - * XXX we should check if the task is an idle task. - */ - flags = 0; - if (counter->ctx->task) - flags |= PPMU_ONLY_COUNT_RUN; - - /* - * If this machine has limited counters, check whether this - * event could go on a limited counter. - */ - if (ppmu->flags & PPMU_LIMITED_PMC5_6) { - if (can_go_on_limited_pmc(counter, ev, flags)) { - flags |= PPMU_LIMITED_PMC_OK; - } else if (ppmu->limited_pmc_event(ev)) { - /* - * The requested event is on a limited PMC, - * but we can't use a limited PMC; see if any - * alternative goes on a normal PMC. - */ - ev = normal_pmc_alternative(ev, flags); - if (!ev) - return ERR_PTR(-EINVAL); - } - } - - /* - * If this is in a group, check if it can go on with all the - * other hardware counters in the group. We assume the counter - * hasn't been linked into its leader's sibling list at this point. - */ - n = 0; - if (counter->group_leader != counter) { - n = collect_events(counter->group_leader, ppmu->n_counter - 1, - ctrs, events, cflags); - if (n < 0) - return ERR_PTR(-EINVAL); - } - events[n] = ev; - ctrs[n] = counter; - cflags[n] = flags; - if (check_excludes(ctrs, cflags, n, 1)) - return ERR_PTR(-EINVAL); - - cpuhw = &get_cpu_var(cpu_hw_counters); - err = power_check_constraints(cpuhw, events, cflags, n + 1); - put_cpu_var(cpu_hw_counters); - if (err) - return ERR_PTR(-EINVAL); - - counter->hw.config = events[n]; - counter->hw.counter_base = cflags[n]; - counter->hw.last_period = counter->hw.sample_period; - atomic64_set(&counter->hw.period_left, counter->hw.last_period); - - /* - * See if we need to reserve the PMU. - * If no counters are currently in use, then we have to take a - * mutex to ensure that we don't race with another task doing - * reserve_pmc_hardware or release_pmc_hardware. - */ - err = 0; - if (!atomic_inc_not_zero(&num_counters)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && - reserve_pmc_hardware(perf_counter_interrupt)) - err = -EBUSY; - else - atomic_inc(&num_counters); - mutex_unlock(&pmc_reserve_mutex); - } - counter->destroy = hw_perf_counter_destroy; - - if (err) - return ERR_PTR(err); - return &power_pmu; -} - -/* - * A counter has overflowed; update its count and record - * things if requested. Note that interrupts are hard-disabled - * here so there is no possibility of being interrupted. - */ -static void record_and_restart(struct perf_counter *counter, unsigned long val, - struct pt_regs *regs, int nmi) -{ - u64 period = counter->hw.sample_period; - s64 prev, delta, left; - int record = 0; - - /* we don't have to worry about interrupts here */ - prev = atomic64_read(&counter->hw.prev_count); - delta = (val - prev) & 0xfffffffful; - atomic64_add(delta, &counter->count); - - /* - * See if the total period for this counter has expired, - * and update for the next period. - */ - val = 0; - left = atomic64_read(&counter->hw.period_left) - delta; - if (period) { - if (left <= 0) { - left += period; - if (left <= 0) - left = period; - record = 1; - } - if (left < 0x80000000LL) - val = 0x80000000LL - left; - } - - /* - * Finally record data if requested. - */ - if (record) { - struct perf_sample_data data = { - .addr = 0, - .period = counter->hw.last_period, - }; - - if (counter->attr.sample_type & PERF_SAMPLE_ADDR) - perf_get_data_addr(regs, &data.addr); - - if (perf_counter_overflow(counter, nmi, &data, regs)) { - /* - * Interrupts are coming too fast - throttle them - * by setting the counter to 0, so it will be - * at least 2^30 cycles until the next interrupt - * (assuming each counter counts at most 2 counts - * per cycle). - */ - val = 0; - left = ~0ULL >> 1; - } - } - - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); -} - -/* - * Called from generic code to get the misc flags (i.e. processor mode) - * for an event. - */ -unsigned long perf_misc_flags(struct pt_regs *regs) -{ - u32 flags = perf_get_misc_flags(regs); - - if (flags) - return flags; - return user_mode(regs) ? PERF_EVENT_MISC_USER : - PERF_EVENT_MISC_KERNEL; -} - -/* - * Called from generic code to get the instruction pointer - * for an event. - */ -unsigned long perf_instruction_pointer(struct pt_regs *regs) -{ - unsigned long ip; - - if (TRAP(regs) != 0xf00) - return regs->nip; /* not a PMU interrupt */ - - ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); - return ip; -} - -/* - * Performance monitor interrupt stuff - */ -static void perf_counter_interrupt(struct pt_regs *regs) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - unsigned long val; - int found = 0; - int nmi; - - if (cpuhw->n_limited) - freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), - mfspr(SPRN_PMC6)); - - perf_read_regs(regs); - - nmi = perf_intr_is_nmi(regs); - if (nmi) - nmi_enter(); - else - irq_enter(); - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (!counter->hw.idx || is_limited_pmc(counter->hw.idx)) - continue; - val = read_pmc(counter->hw.idx); - if ((int)val < 0) { - /* counter has overflowed */ - found = 1; - record_and_restart(counter, val, regs, nmi); - } - } - - /* - * In case we didn't find and reset the counter that caused - * the interrupt, scan all counters and reset any that are - * negative, to avoid getting continual interrupts. - * Any that we processed in the previous loop will not be negative. - */ - if (!found) { - for (i = 0; i < ppmu->n_counter; ++i) { - if (is_limited_pmc(i + 1)) - continue; - val = read_pmc(i + 1); - if ((int)val < 0) - write_pmc(i + 1, 0); - } - } - - /* - * Reset MMCR0 to its normal value. This will set PMXE and - * clear FC (freeze counters) and PMAO (perf mon alert occurred) - * and thus allow interrupts to occur again. - * XXX might want to use MSR.PM to keep the counters frozen until - * we get back out of this interrupt. - */ - write_mmcr0(cpuhw, cpuhw->mmcr[0]); - - if (nmi) - nmi_exit(); - else - irq_exit(); -} - -void hw_perf_counter_setup(int cpu) -{ - struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); - - if (!ppmu) - return; - memset(cpuhw, 0, sizeof(*cpuhw)); - cpuhw->mmcr[0] = MMCR0_FC; -} - -int register_power_pmu(struct power_pmu *pmu) -{ - if (ppmu) - return -EBUSY; /* something's already registered */ - - ppmu = pmu; - pr_info("%s performance monitor hardware support registered\n", - pmu->name); - -#ifdef MSR_HV - /* - * Use FCHV to ignore kernel events if MSR.HV is set. - */ - if (mfmsr() & MSR_HV) - freeze_counters_kernel = MMCR0_FCHV; -#endif /* CONFIG_PPC64 */ - - return 0; -} diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c new file mode 100644 index 000000000000..c98321fcb459 --- /dev/null +++ b/arch/powerpc/kernel/perf_event.c @@ -0,0 +1,1315 @@ +/* + * Performance event support - powerpc architecture code + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct cpu_hw_events { + int n_events; + int n_percpu; + int disabled; + int n_added; + int n_limited; + u8 pmcs_enabled; + struct perf_event *event[MAX_HWEVENTS]; + u64 events[MAX_HWEVENTS]; + unsigned int flags[MAX_HWEVENTS]; + unsigned long mmcr[3]; + struct perf_event *limited_event[MAX_LIMITED_HWEVENTS]; + u8 limited_hwidx[MAX_LIMITED_HWEVENTS]; + u64 alternatives[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + unsigned long amasks[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; + unsigned long avalues[MAX_HWEVENTS][MAX_EVENT_ALTERNATIVES]; +}; +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events); + +struct power_pmu *ppmu; + +/* + * Normally, to ignore kernel events we set the FCS (freeze events + * in supervisor mode) bit in MMCR0, but if the kernel runs with the + * hypervisor bit set in the MSR, or if we are running on a processor + * where the hypervisor bit is forced to 1 (as on Apple G5 processors), + * then we need to use the FCHV bit to ignore kernel events. + */ +static unsigned int freeze_events_kernel = MMCR0_FCS; + +/* + * 32-bit doesn't have MMCRA but does have an MMCR2, + * and a few other names are different. + */ +#ifdef CONFIG_PPC32 + +#define MMCR0_FCHV 0 +#define MMCR0_PMCjCE MMCR0_PMCnCE + +#define SPRN_MMCRA SPRN_MMCR2 +#define MMCRA_SAMPLE_ENABLE 0 + +static inline unsigned long perf_ip_adjust(struct pt_regs *regs) +{ + return 0; +} +static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) { } +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + return 0; +} +static inline void perf_read_regs(struct pt_regs *regs) { } +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ + return 0; +} + +#endif /* CONFIG_PPC32 */ + +/* + * Things that are specific to 64-bit implementations. + */ +#ifdef CONFIG_PPC64 + +static inline unsigned long perf_ip_adjust(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + + if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + unsigned long slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; + if (slot > 1) + return 4 * (slot - 1); + } + return 0; +} + +/* + * The user wants a data address recorded. + * If we're not doing instruction sampling, give them the SDAR + * (sampled data address). If we are doing instruction sampling, then + * only give them the SDAR if it corresponds to the instruction + * pointed to by SIAR; this is indicated by the [POWER6_]MMCRA_SDSYNC + * bit in MMCRA. + */ +static inline void perf_get_data_addr(struct pt_regs *regs, u64 *addrp) +{ + unsigned long mmcra = regs->dsisr; + unsigned long sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? + POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + + if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) + *addrp = mfspr(SPRN_SDAR); +} + +static inline u32 perf_get_misc_flags(struct pt_regs *regs) +{ + unsigned long mmcra = regs->dsisr; + + if (TRAP(regs) != 0xf00) + return 0; /* not a PMU interrupt */ + + if (ppmu->flags & PPMU_ALT_SIPR) { + if (mmcra & POWER6_MMCRA_SIHV) + return PERF_RECORD_MISC_HYPERVISOR; + return (mmcra & POWER6_MMCRA_SIPR) ? + PERF_RECORD_MISC_USER : PERF_RECORD_MISC_KERNEL; + } + if (mmcra & MMCRA_SIHV) + return PERF_RECORD_MISC_HYPERVISOR; + return (mmcra & MMCRA_SIPR) ? PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; +} + +/* + * Overload regs->dsisr to store MMCRA so we only need to read it once + * on each interrupt. + */ +static inline void perf_read_regs(struct pt_regs *regs) +{ + regs->dsisr = mfspr(SPRN_MMCRA); +} + +/* + * If interrupts were soft-disabled when a PMU interrupt occurs, treat + * it as an NMI. + */ +static inline int perf_intr_is_nmi(struct pt_regs *regs) +{ + return !regs->softe; +} + +#endif /* CONFIG_PPC64 */ + +static void perf_event_interrupt(struct pt_regs *regs); + +void perf_event_print_debug(void) +{ +} + +/* + * Read one performance monitor event (PMC). + */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 1: + val = mfspr(SPRN_PMC1); + break; + case 2: + val = mfspr(SPRN_PMC2); + break; + case 3: + val = mfspr(SPRN_PMC3); + break; + case 4: + val = mfspr(SPRN_PMC4); + break; + case 5: + val = mfspr(SPRN_PMC5); + break; + case 6: + val = mfspr(SPRN_PMC6); + break; +#ifdef CONFIG_PPC64 + case 7: + val = mfspr(SPRN_PMC7); + break; + case 8: + val = mfspr(SPRN_PMC8); + break; +#endif /* CONFIG_PPC64 */ + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 1: + mtspr(SPRN_PMC1, val); + break; + case 2: + mtspr(SPRN_PMC2, val); + break; + case 3: + mtspr(SPRN_PMC3, val); + break; + case 4: + mtspr(SPRN_PMC4, val); + break; + case 5: + mtspr(SPRN_PMC5, val); + break; + case 6: + mtspr(SPRN_PMC6, val); + break; +#ifdef CONFIG_PPC64 + case 7: + mtspr(SPRN_PMC7, val); + break; + case 8: + mtspr(SPRN_PMC8, val); + break; +#endif /* CONFIG_PPC64 */ + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } +} + +/* + * Check if a set of events can all go on the PMU at once. + * If they can't, this will look at alternative codes for the events + * and see if any combination of alternative codes is feasible. + * The feasible set is returned in event_id[]. + */ +static int power_check_constraints(struct cpu_hw_events *cpuhw, + u64 event_id[], unsigned int cflags[], + int n_ev) +{ + unsigned long mask, value, nv; + unsigned long smasks[MAX_HWEVENTS], svalues[MAX_HWEVENTS]; + int n_alt[MAX_HWEVENTS], choice[MAX_HWEVENTS]; + int i, j; + unsigned long addf = ppmu->add_fields; + unsigned long tadd = ppmu->test_adder; + + if (n_ev > ppmu->n_event) + return -1; + + /* First see if the events will go on as-is */ + for (i = 0; i < n_ev; ++i) { + if ((cflags[i] & PPMU_LIMITED_PMC_REQD) + && !ppmu->limited_pmc_event(event_id[i])) { + ppmu->get_alternatives(event_id[i], cflags[i], + cpuhw->alternatives[i]); + event_id[i] = cpuhw->alternatives[i][0]; + } + if (ppmu->get_constraint(event_id[i], &cpuhw->amasks[i][0], + &cpuhw->avalues[i][0])) + return -1; + } + value = mask = 0; + for (i = 0; i < n_ev; ++i) { + nv = (value | cpuhw->avalues[i][0]) + + (value & cpuhw->avalues[i][0] & addf); + if ((((nv + tadd) ^ value) & mask) != 0 || + (((nv + tadd) ^ cpuhw->avalues[i][0]) & + cpuhw->amasks[i][0]) != 0) + break; + value = nv; + mask |= cpuhw->amasks[i][0]; + } + if (i == n_ev) + return 0; /* all OK */ + + /* doesn't work, gather alternatives... */ + if (!ppmu->get_alternatives) + return -1; + for (i = 0; i < n_ev; ++i) { + choice[i] = 0; + n_alt[i] = ppmu->get_alternatives(event_id[i], cflags[i], + cpuhw->alternatives[i]); + for (j = 1; j < n_alt[i]; ++j) + ppmu->get_constraint(cpuhw->alternatives[i][j], + &cpuhw->amasks[i][j], + &cpuhw->avalues[i][j]); + } + + /* enumerate all possibilities and see if any will work */ + i = 0; + j = -1; + value = mask = nv = 0; + while (i < n_ev) { + if (j >= 0) { + /* we're backtracking, restore context */ + value = svalues[i]; + mask = smasks[i]; + j = choice[i]; + } + /* + * See if any alternative k for event_id i, + * where k > j, will satisfy the constraints. + */ + while (++j < n_alt[i]) { + nv = (value | cpuhw->avalues[i][j]) + + (value & cpuhw->avalues[i][j] & addf); + if ((((nv + tadd) ^ value) & mask) == 0 && + (((nv + tadd) ^ cpuhw->avalues[i][j]) + & cpuhw->amasks[i][j]) == 0) + break; + } + if (j >= n_alt[i]) { + /* + * No feasible alternative, backtrack + * to event_id i-1 and continue enumerating its + * alternatives from where we got up to. + */ + if (--i < 0) + return -1; + } else { + /* + * Found a feasible alternative for event_id i, + * remember where we got up to with this event_id, + * go on to the next event_id, and start with + * the first alternative for it. + */ + choice[i] = j; + svalues[i] = value; + smasks[i] = mask; + value = nv; + mask |= cpuhw->amasks[i][j]; + ++i; + j = -1; + } + } + + /* OK, we have a feasible combination, tell the caller the solution */ + for (i = 0; i < n_ev; ++i) + event_id[i] = cpuhw->alternatives[i][choice[i]]; + return 0; +} + +/* + * Check if newly-added events have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added events. + */ +static int check_excludes(struct perf_event **ctrs, unsigned int cflags[], + int n_prev, int n_new) +{ + int eu = 0, ek = 0, eh = 0; + int i, n, first; + struct perf_event *event; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + first = 1; + for (i = 0; i < n; ++i) { + if (cflags[i] & PPMU_LIMITED_PMC_OK) { + cflags[i] &= ~PPMU_LIMITED_PMC_REQD; + continue; + } + event = ctrs[i]; + if (first) { + eu = event->attr.exclude_user; + ek = event->attr.exclude_kernel; + eh = event->attr.exclude_hv; + first = 0; + } else if (event->attr.exclude_user != eu || + event->attr.exclude_kernel != ek || + event->attr.exclude_hv != eh) { + return -EAGAIN; + } + } + + if (eu || ek || eh) + for (i = 0; i < n; ++i) + if (cflags[i] & PPMU_LIMITED_PMC_OK) + cflags[i] |= PPMU_LIMITED_PMC_REQD; + + return 0; +} + +static void power_pmu_read(struct perf_event *event) +{ + s64 val, delta, prev; + + if (!event->hw.idx) + return; + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = atomic64_read(&event->hw.prev_count); + barrier(); + val = read_pmc(event->hw.idx); + } while (atomic64_cmpxchg(&event->hw.prev_count, prev, val) != prev); + + /* The events are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + atomic64_sub(delta, &event->hw.period_left); +} + +/* + * On some machines, PMC5 and PMC6 can't be written, don't respect + * the freeze conditions, and don't generate interrupts. This tells + * us if `event' is using such a PMC. + */ +static int is_limited_pmc(int pmcnum) +{ + return (ppmu->flags & PPMU_LIMITED_PMC5_6) + && (pmcnum == 5 || pmcnum == 6); +} + +static void freeze_limited_events(struct cpu_hw_events *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_event *event; + u64 val, prev, delta; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + event = cpuhw->limited_event[i]; + if (!event->hw.idx) + continue; + val = (event->hw.idx == 5) ? pmc5 : pmc6; + prev = atomic64_read(&event->hw.prev_count); + event->hw.idx = 0; + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + } +} + +static void thaw_limited_events(struct cpu_hw_events *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_event *event; + u64 val; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + event = cpuhw->limited_event[i]; + event->hw.idx = cpuhw->limited_hwidx[i]; + val = (event->hw.idx == 5) ? pmc5 : pmc6; + atomic64_set(&event->hw.prev_count, val); + perf_event_update_userpage(event); + } +} + +/* + * Since limited events don't respect the freeze conditions, we + * have to read them immediately after freezing or unfreezing the + * other events. We try to keep the values from the limited + * events as consistent as possible by keeping the delay (in + * cycles and instructions) between freezing/unfreezing and reading + * the limited events as small and consistent as possible. + * Therefore, if any limited events are in use, we read them + * both, and always in the same order, to minimize variability, + * and do it inside the same asm that writes MMCR0. + */ +static void write_mmcr0(struct cpu_hw_events *cpuhw, unsigned long mmcr0) +{ + unsigned long pmc5, pmc6; + + if (!cpuhw->n_limited) { + mtspr(SPRN_MMCR0, mmcr0); + return; + } + + /* + * Write MMCR0, then read PMC5 and PMC6 immediately. + * To ensure we don't get a performance monitor interrupt + * between writing MMCR0 and freezing/thawing the limited + * events, we first write MMCR0 with the event overflow + * interrupt enable bits turned off. + */ + asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" + : "=&r" (pmc5), "=&r" (pmc6) + : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), + "i" (SPRN_MMCR0), + "i" (SPRN_PMC5), "i" (SPRN_PMC6)); + + if (mmcr0 & MMCR0_FC) + freeze_limited_events(cpuhw, pmc5, pmc6); + else + thaw_limited_events(cpuhw, pmc5, pmc6); + + /* + * Write the full MMCR0 including the event overflow interrupt + * enable bits, if necessary. + */ + if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) + mtspr(SPRN_MMCR0, mmcr0); +} + +/* + * Disable all events to prevent PMU interrupts and to allow + * events to be added or removed. + */ +void hw_perf_disable(void) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + + if (!ppmu) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + + if (!cpuhw->disabled) { + cpuhw->disabled = 1; + cpuhw->n_added = 0; + + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + ppc_enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + + /* + * Disable instruction sampling if it was enabled + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mtspr(SPRN_MMCRA, + cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mb(); + } + + /* + * Set the 'freeze events' bit. + * The barrier is to make sure the mtspr has been + * executed and the PMU has frozen the events + * before we return. + */ + write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); + mb(); + } + local_irq_restore(flags); +} + +/* + * Re-enable all events if disable == 0. + * If we were previously disabled and events were added, then + * put the new config on the PMU. + */ +void hw_perf_enable(void) +{ + struct perf_event *event; + struct cpu_hw_events *cpuhw; + unsigned long flags; + long i; + unsigned long val; + s64 left; + unsigned int hwc_index[MAX_HWEVENTS]; + int n_lim; + int idx; + + if (!ppmu) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_events); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw->disabled = 0; + + /* + * If we didn't change anything, or only removed events, + * no need to recalculate MMCR* settings and reset the PMCs. + * Just reenable the PMU with the current MMCR* settings + * (possibly updated for removal of events). + */ + if (!cpuhw->n_added) { + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + if (cpuhw->n_events == 0) + ppc_set_pmu_inuse(0); + goto out_enable; + } + + /* + * Compute MMCR* values for the new set of events + */ + if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_events, hwc_index, + cpuhw->mmcr)) { + /* shouldn't ever get here */ + printk(KERN_ERR "oops compute_mmcr failed\n"); + goto out; + } + + /* + * Add in MMCR0 freeze bits corresponding to the + * attr.exclude_* bits for the first event. + * We have already checked that all events have the + * same values for these bits as the first event. + */ + event = cpuhw->event[0]; + if (event->attr.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (event->attr.exclude_kernel) + cpuhw->mmcr[0] |= freeze_events_kernel; + if (event->attr.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + + /* + * Write the new configuration to MMCR* with the freeze + * bit set and set the hardware events to their initial values. + * Then unfreeze the events. + */ + ppc_set_pmu_inuse(1); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) + | MMCR0_FC); + + /* + * Read off any pre-existing events that need to move + * to another PMC. + */ + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (event->hw.idx && event->hw.idx != hwc_index[i] + 1) { + power_pmu_read(event); + write_pmc(event->hw.idx, 0); + event->hw.idx = 0; + } + } + + /* + * Initialize the PMCs for all the new and moved events. + */ + cpuhw->n_limited = n_lim = 0; + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (event->hw.idx) + continue; + idx = hwc_index[i] + 1; + if (is_limited_pmc(idx)) { + cpuhw->limited_event[n_lim] = event; + cpuhw->limited_hwidx[n_lim] = idx; + ++n_lim; + continue; + } + val = 0; + if (event->hw.sample_period) { + left = atomic64_read(&event->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + atomic64_set(&event->hw.prev_count, val); + event->hw.idx = idx; + write_pmc(idx, val); + perf_event_update_userpage(event); + } + cpuhw->n_limited = n_lim; + cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + + out_enable: + mb(); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); + + /* + * Enable instruction sampling if necessary + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mb(); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + } + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_event *group, int max_count, + struct perf_event *ctrs[], u64 *events, + unsigned int *flags) +{ + int n = 0; + struct perf_event *event; + + if (!is_software_event(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + flags[n] = group->hw.event_base; + events[n++] = group->hw.config; + } + list_for_each_entry(event, &group->sibling_list, list_entry) { + if (!is_software_event(event) && + event->state != PERF_EVENT_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = event; + flags[n] = event->hw.event_base; + events[n++] = event->hw.config; + } + } + return n; +} + +static void event_sched_in(struct perf_event *event, int cpu) +{ + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; + event->tstamp_running += event->ctx->time - event->tstamp_stopped; + if (is_software_event(event)) + event->pmu->enable(event); +} + +/* + * Called to enable a whole group of events. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + */ +int hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + struct cpu_hw_events *cpuhw; + long i, n, n0; + struct perf_event *sub; + + if (!ppmu) + return 0; + cpuhw = &__get_cpu_var(cpu_hw_events); + n0 = cpuhw->n_events; + n = collect_events(group_leader, ppmu->n_event - n0, + &cpuhw->event[n0], &cpuhw->events[n0], + &cpuhw->flags[n0]); + if (n < 0) + return -EAGAIN; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, n)) + return -EAGAIN; + i = power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n + n0); + if (i < 0) + return -EAGAIN; + cpuhw->n_events = n0 + n; + cpuhw->n_added += n; + + /* + * OK, this group can go on; update event states etc., + * and enable any software events + */ + for (i = n0; i < n0 + n; ++i) + cpuhw->event[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; + n = 1; + event_sched_in(group_leader, cpu); + list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { + if (sub->state != PERF_EVENT_STATE_OFF) { + event_sched_in(sub, cpu); + ++n; + } + } + ctx->nr_active += n; + + return 1; +} + +/* + * Add a event to the PMU. + * If all events are not already frozen, then we disable and + * re-enable the PMU in order to get hw_perf_enable to do the + * actual work of reconfiguring the PMU. + */ +static int power_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuhw; + unsigned long flags; + int n0; + int ret = -EAGAIN; + + local_irq_save(flags); + perf_disable(); + + /* + * Add the event to the list (if there is room) + * and check whether the total set is still feasible. + */ + cpuhw = &__get_cpu_var(cpu_hw_events); + n0 = cpuhw->n_events; + if (n0 >= ppmu->n_event) + goto out; + cpuhw->event[n0] = event; + cpuhw->events[n0] = event->hw.config; + cpuhw->flags[n0] = event->hw.event_base; + if (check_excludes(cpuhw->event, cpuhw->flags, n0, 1)) + goto out; + if (power_check_constraints(cpuhw, cpuhw->events, cpuhw->flags, n0 + 1)) + goto out; + + event->hw.config = cpuhw->events[n0]; + ++cpuhw->n_events; + ++cpuhw->n_added; + + ret = 0; + out: + perf_enable(); + local_irq_restore(flags); + return ret; +} + +/* + * Remove a event from the PMU. + */ +static void power_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuhw; + long i; + unsigned long flags; + + local_irq_save(flags); + perf_disable(); + + power_pmu_read(event); + + cpuhw = &__get_cpu_var(cpu_hw_events); + for (i = 0; i < cpuhw->n_events; ++i) { + if (event == cpuhw->event[i]) { + while (++i < cpuhw->n_events) + cpuhw->event[i-1] = cpuhw->event[i]; + --cpuhw->n_events; + ppmu->disable_pmc(event->hw.idx - 1, cpuhw->mmcr); + if (event->hw.idx) { + write_pmc(event->hw.idx, 0); + event->hw.idx = 0; + } + perf_event_update_userpage(event); + break; + } + } + for (i = 0; i < cpuhw->n_limited; ++i) + if (event == cpuhw->limited_event[i]) + break; + if (i < cpuhw->n_limited) { + while (++i < cpuhw->n_limited) { + cpuhw->limited_event[i-1] = cpuhw->limited_event[i]; + cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; + } + --cpuhw->n_limited; + } + if (cpuhw->n_events == 0) { + /* disable exceptions if no events are running */ + cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); + } + + perf_enable(); + local_irq_restore(flags); +} + +/* + * Re-enable interrupts on a event after they were throttled + * because they were coming too fast. + */ +static void power_pmu_unthrottle(struct perf_event *event) +{ + s64 val, left; + unsigned long flags; + + if (!event->hw.idx || !event->hw.sample_period) + return; + local_irq_save(flags); + perf_disable(); + power_pmu_read(event); + left = event->hw.sample_period; + event->hw.last_period = left; + val = 0; + if (left < 0x80000000L) + val = 0x80000000L - left; + write_pmc(event->hw.idx, val); + atomic64_set(&event->hw.prev_count, val); + atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); + perf_enable(); + local_irq_restore(flags); +} + +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, + .unthrottle = power_pmu_unthrottle, +}; + +/* + * Return 1 if we might be able to put event on a limited PMC, + * or 0 if not. + * A event can only go on a limited PMC if it counts something + * that a limited PMC can count, doesn't require interrupts, and + * doesn't exclude any processor mode. + */ +static int can_go_on_limited_pmc(struct perf_event *event, u64 ev, + unsigned int flags) +{ + int n; + u64 alt[MAX_EVENT_ALTERNATIVES]; + + if (event->attr.exclude_user + || event->attr.exclude_kernel + || event->attr.exclude_hv + || event->attr.sample_period) + return 0; + + if (ppmu->limited_pmc_event(ev)) + return 1; + + /* + * The requested event_id isn't on a limited PMC already; + * see if any alternative code goes on a limited PMC. + */ + if (!ppmu->get_alternatives) + return 0; + + flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; + n = ppmu->get_alternatives(ev, flags, alt); + + return n > 0; +} + +/* + * Find an alternative event_id that goes on a normal PMC, if possible, + * and return the event_id code, or 0 if there is no such alternative. + * (Note: event_id code 0 is "don't count" on all machines.) + */ +static u64 normal_pmc_alternative(u64 ev, unsigned long flags) +{ + u64 alt[MAX_EVENT_ALTERNATIVES]; + int n; + + flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); + n = ppmu->get_alternatives(ev, flags, alt); + if (!n) + return 0; + return alt[0]; +} + +/* Number of perf_events counting hardware events */ +static atomic_t num_events; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Release the PMU if this is the last perf_event. + */ +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (!atomic_add_unless(&num_events, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_events) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +/* + * Translate a generic cache event_id config to a raw event_id code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ + unsigned long type, op, result; + int ev; + + if (!ppmu->cache_events) + return -EINVAL; + + /* unpack config */ + type = config & 0xff; + op = (config >> 8) & 0xff; + result = (config >> 16) & 0xff; + + if (type >= PERF_COUNT_HW_CACHE_MAX || + op >= PERF_COUNT_HW_CACHE_OP_MAX || + result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + ev = (*ppmu->cache_events)[type][op][result]; + if (ev == 0) + return -EOPNOTSUPP; + if (ev == -1) + return -EINVAL; + *eventp = ev; + return 0; +} + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + u64 ev; + unsigned long flags; + struct perf_event *ctrs[MAX_HWEVENTS]; + u64 events[MAX_HWEVENTS]; + unsigned int cflags[MAX_HWEVENTS]; + int n; + int err; + struct cpu_hw_events *cpuhw; + + if (!ppmu) + return ERR_PTR(-ENXIO); + switch (event->attr.type) { + case PERF_TYPE_HARDWARE: + ev = event->attr.config; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) + return ERR_PTR(-EOPNOTSUPP); + ev = ppmu->generic_events[ev]; + break; + case PERF_TYPE_HW_CACHE: + err = hw_perf_cache_event(event->attr.config, &ev); + if (err) + return ERR_PTR(err); + break; + case PERF_TYPE_RAW: + ev = event->attr.config; + break; + default: + return ERR_PTR(-EINVAL); + } + event->hw.config_base = ev; + event->hw.idx = 0; + + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + event->attr.exclude_hv = 0; + + /* + * If this is a per-task event, then we can use + * PM_RUN_* events interchangeably with their non RUN_* + * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. + * XXX we should check if the task is an idle task. + */ + flags = 0; + if (event->ctx->task) + flags |= PPMU_ONLY_COUNT_RUN; + + /* + * If this machine has limited events, check whether this + * event_id could go on a limited event. + */ + if (ppmu->flags & PPMU_LIMITED_PMC5_6) { + if (can_go_on_limited_pmc(event, ev, flags)) { + flags |= PPMU_LIMITED_PMC_OK; + } else if (ppmu->limited_pmc_event(ev)) { + /* + * The requested event_id is on a limited PMC, + * but we can't use a limited PMC; see if any + * alternative goes on a normal PMC. + */ + ev = normal_pmc_alternative(ev, flags); + if (!ev) + return ERR_PTR(-EINVAL); + } + } + + /* + * If this is in a group, check if it can go on with all the + * other hardware events in the group. We assume the event + * hasn't been linked into its leader's sibling list at this point. + */ + n = 0; + if (event->group_leader != event) { + n = collect_events(event->group_leader, ppmu->n_event - 1, + ctrs, events, cflags); + if (n < 0) + return ERR_PTR(-EINVAL); + } + events[n] = ev; + ctrs[n] = event; + cflags[n] = flags; + if (check_excludes(ctrs, cflags, n, 1)) + return ERR_PTR(-EINVAL); + + cpuhw = &get_cpu_var(cpu_hw_events); + err = power_check_constraints(cpuhw, events, cflags, n + 1); + put_cpu_var(cpu_hw_events); + if (err) + return ERR_PTR(-EINVAL); + + event->hw.config = events[n]; + event->hw.event_base = cflags[n]; + event->hw.last_period = event->hw.sample_period; + atomic64_set(&event->hw.period_left, event->hw.last_period); + + /* + * See if we need to reserve the PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. + */ + err = 0; + if (!atomic_inc_not_zero(&num_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_events) == 0 && + reserve_pmc_hardware(perf_event_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_events); + mutex_unlock(&pmc_reserve_mutex); + } + event->destroy = hw_perf_event_destroy; + + if (err) + return ERR_PTR(err); + return &power_pmu; +} + +/* + * A event has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_event *event, unsigned long val, + struct pt_regs *regs, int nmi) +{ + u64 period = event->hw.sample_period; + s64 prev, delta, left; + int record = 0; + + /* we don't have to worry about interrupts here */ + prev = atomic64_read(&event->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &event->count); + + /* + * See if the total period for this event has expired, + * and update for the next period. + */ + val = 0; + left = atomic64_read(&event->hw.period_left) - delta; + if (period) { + if (left <= 0) { + left += period; + if (left <= 0) + left = period; + record = 1; + } + if (left < 0x80000000LL) + val = 0x80000000LL - left; + } + + /* + * Finally record data if requested. + */ + if (record) { + struct perf_sample_data data = { + .addr = 0, + .period = event->hw.last_period, + }; + + if (event->attr.sample_type & PERF_SAMPLE_ADDR) + perf_get_data_addr(regs, &data.addr); + + if (perf_event_overflow(event, nmi, &data, regs)) { + /* + * Interrupts are coming too fast - throttle them + * by setting the event to 0, so it will be + * at least 2^30 cycles until the next interrupt + * (assuming each event counts at most 2 counts + * per cycle). + */ + val = 0; + left = ~0ULL >> 1; + } + } + + write_pmc(event->hw.idx, val); + atomic64_set(&event->hw.prev_count, val); + atomic64_set(&event->hw.period_left, left); + perf_event_update_userpage(event); +} + +/* + * Called from generic code to get the misc flags (i.e. processor mode) + * for an event_id. + */ +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + u32 flags = perf_get_misc_flags(regs); + + if (flags) + return flags; + return user_mode(regs) ? PERF_RECORD_MISC_USER : + PERF_RECORD_MISC_KERNEL; +} + +/* + * Called from generic code to get the instruction pointer + * for an event_id. + */ +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + unsigned long ip; + + if (TRAP(regs) != 0xf00) + return regs->nip; /* not a PMU interrupt */ + + ip = mfspr(SPRN_SIAR) + perf_ip_adjust(regs); + return ip; +} + +/* + * Performance monitor interrupt stuff + */ +static void perf_event_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_events *cpuhw = &__get_cpu_var(cpu_hw_events); + struct perf_event *event; + unsigned long val; + int found = 0; + int nmi; + + if (cpuhw->n_limited) + freeze_limited_events(cpuhw, mfspr(SPRN_PMC5), + mfspr(SPRN_PMC6)); + + perf_read_regs(regs); + + nmi = perf_intr_is_nmi(regs); + if (nmi) + nmi_enter(); + else + irq_enter(); + + for (i = 0; i < cpuhw->n_events; ++i) { + event = cpuhw->event[i]; + if (!event->hw.idx || is_limited_pmc(event->hw.idx)) + continue; + val = read_pmc(event->hw.idx); + if ((int)val < 0) { + /* event has overflowed */ + found = 1; + record_and_restart(event, val, regs, nmi); + } + } + + /* + * In case we didn't find and reset the event that caused + * the interrupt, scan all events and reset any that are + * negative, to avoid getting continual interrupts. + * Any that we processed in the previous loop will not be negative. + */ + if (!found) { + for (i = 0; i < ppmu->n_event; ++i) { + if (is_limited_pmc(i + 1)) + continue; + val = read_pmc(i + 1); + if ((int)val < 0) + write_pmc(i + 1, 0); + } + } + + /* + * Reset MMCR0 to its normal value. This will set PMXE and + * clear FC (freeze events) and PMAO (perf mon alert occurred) + * and thus allow interrupts to occur again. + * XXX might want to use MSR.PM to keep the events frozen until + * we get back out of this interrupt. + */ + write_mmcr0(cpuhw, cpuhw->mmcr[0]); + + if (nmi) + nmi_exit(); + else + irq_exit(); +} + +void hw_perf_event_setup(int cpu) +{ + struct cpu_hw_events *cpuhw = &per_cpu(cpu_hw_events, cpu); + + if (!ppmu) + return; + memset(cpuhw, 0, sizeof(*cpuhw)); + cpuhw->mmcr[0] = MMCR0_FC; +} + +int register_power_pmu(struct power_pmu *pmu) +{ + if (ppmu) + return -EBUSY; /* something's already registered */ + + ppmu = pmu; + pr_info("%s performance monitor hardware support registered\n", + pmu->name); + +#ifdef MSR_HV + /* + * Use FCHV to ignore kernel events if MSR.HV is set. + */ + if (mfmsr() & MSR_HV) + freeze_events_kernel = MMCR0_FCHV; +#endif /* CONFIG_PPC64 */ + + return 0; +} diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 3c90a3d9173e..2a361cdda635 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 31918af3e355..0f4c1c73a6ad 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 867f6f663963..c351b3a57fbb 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index fa21890531da..ca399ba5034c 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c index 018d094d92f9..28a4daacdc02 100644 --- a/arch/powerpc/kernel/power7-pmu.c +++ b/arch/powerpc/kernel/power7-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index 75dccb71a043..479574413a93 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -9,7 +9,7 @@ * 2 of the License, or (at your option) any later version. */ #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 465e498bcb33..df45a7449a66 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -53,7 +53,7 @@ #include #include #include -#include +#include #include #include @@ -527,25 +527,25 @@ void __init iSeries_time_init_early(void) } #endif /* CONFIG_PPC_ISERIES */ -#if defined(CONFIG_PERF_COUNTERS) && defined(CONFIG_PPC32) -DEFINE_PER_CPU(u8, perf_counter_pending); +#if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_PPC32) +DEFINE_PER_CPU(u8, perf_event_pending); -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { - get_cpu_var(perf_counter_pending) = 1; + get_cpu_var(perf_event_pending) = 1; set_dec(1); - put_cpu_var(perf_counter_pending); + put_cpu_var(perf_event_pending); } -#define test_perf_counter_pending() __get_cpu_var(perf_counter_pending) -#define clear_perf_counter_pending() __get_cpu_var(perf_counter_pending) = 0 +#define test_perf_event_pending() __get_cpu_var(perf_event_pending) +#define clear_perf_event_pending() __get_cpu_var(perf_event_pending) = 0 -#else /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ +#else /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ -#define test_perf_counter_pending() 0 -#define clear_perf_counter_pending() +#define test_perf_event_pending() 0 +#define clear_perf_event_pending() -#endif /* CONFIG_PERF_COUNTERS && CONFIG_PPC32 */ +#endif /* CONFIG_PERF_EVENTS && CONFIG_PPC32 */ /* * For iSeries shared processors, we have to let the hypervisor @@ -573,9 +573,9 @@ void timer_interrupt(struct pt_regs * regs) set_dec(DECREMENTER_MAX); #ifdef CONFIG_PPC32 - if (test_perf_counter_pending()) { - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (test_perf_event_pending()) { + clear_perf_event_pending(); + perf_event_do_pending(); } if (atomic_read(&ppc_n_lost_interrupts) != 0) do_IRQ(regs); diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 830bef0a1131..e7dae82c1285 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { @@ -323,7 +323,7 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9efc8bda01b4..e382cae678b8 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -280,9 +280,9 @@ config PPC_HAVE_PMU_SUPPORT config PPC_PERF_CTRS def_bool y - depends on PERF_COUNTERS && PPC_HAVE_PMU_SUPPORT + depends on PERF_EVENTS && PPC_HAVE_PMU_SUPPORT help - This enables the powerpc-specific perf_counter back-end. + This enables the powerpc-specific perf_event back-end. config SMP depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 1c866efd217d..43c0acad7160 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -94,7 +94,7 @@ config S390 select HAVE_KVM if 64BIT select HAVE_ARCH_TRACEHOOK select INIT_ALL_POSSIBLE - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config SCHED_OMIT_FRAME_POINTER bool diff --git a/arch/s390/include/asm/perf_counter.h b/arch/s390/include/asm/perf_counter.h deleted file mode 100644 index 7015188c2cc2..000000000000 --- a/arch/s390/include/asm/perf_counter.h +++ /dev/null @@ -1,10 +0,0 @@ -/* - * Performance counter support - s390 specific definitions. - * - * Copyright 2009 Martin Schwidefsky, IBM Corporation. - */ - -static inline void set_perf_counter_pending(void) {} -static inline void clear_perf_counter_pending(void) {} - -#define PERF_COUNTER_INDEX_OFFSET 0 diff --git a/arch/s390/include/asm/perf_event.h b/arch/s390/include/asm/perf_event.h new file mode 100644 index 000000000000..3840cbe77637 --- /dev/null +++ b/arch/s390/include/asm/perf_event.h @@ -0,0 +1,10 @@ +/* + * Performance event support - s390 specific definitions. + * + * Copyright 2009 Martin Schwidefsky, IBM Corporation. + */ + +static inline void set_perf_event_pending(void) {} +static inline void clear_perf_event_pending(void) {} + +#define PERF_EVENT_INDEX_OFFSET 0 diff --git a/arch/s390/include/asm/unistd.h b/arch/s390/include/asm/unistd.h index c80602d7c880..cb5232df151e 100644 --- a/arch/s390/include/asm/unistd.h +++ b/arch/s390/include/asm/unistd.h @@ -268,7 +268,7 @@ #define __NR_preadv 328 #define __NR_pwritev 329 #define __NR_rt_tgsigqueueinfo 330 -#define __NR_perf_counter_open 331 +#define __NR_perf_event_open 331 #define NR_syscalls 332 /* diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 88a83366819f..624790042d41 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -1832,11 +1832,11 @@ compat_sys_rt_tgsigqueueinfo_wrapper: llgtr %r5,%r5 # struct compat_siginfo * jg compat_sys_rt_tgsigqueueinfo_wrapper # branch to system call - .globl sys_perf_counter_open_wrapper -sys_perf_counter_open_wrapper: - llgtr %r2,%r2 # const struct perf_counter_attr * + .globl sys_perf_event_open_wrapper +sys_perf_event_open_wrapper: + llgtr %r2,%r2 # const struct perf_event_attr * lgfr %r3,%r3 # pid_t lgfr %r4,%r4 # int lgfr %r5,%r5 # int llgfr %r6,%r6 # unsigned long - jg sys_perf_counter_open # branch to system call + jg sys_perf_event_open # branch to system call diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index ad1acd200385..0b5083681e77 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -339,4 +339,4 @@ SYSCALL(sys_epoll_create1,sys_epoll_create1,sys_epoll_create1_wrapper) SYSCALL(sys_preadv,sys_preadv,compat_sys_preadv_wrapper) SYSCALL(sys_pwritev,sys_pwritev,compat_sys_pwritev_wrapper) SYSCALL(sys_rt_tgsigqueueinfo,sys_rt_tgsigqueueinfo,compat_sys_rt_tgsigqueueinfo_wrapper) /* 330 */ -SYSCALL(sys_perf_counter_open,sys_perf_counter_open,sys_perf_counter_open_wrapper) +SYSCALL(sys_perf_event_open,sys_perf_event_open,sys_perf_event_open_wrapper) diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 1abbadd497e1..6d507462967a 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -10,7 +10,7 @@ * Copyright (C) 1995 Linus Torvalds */ -#include +#include #include #include #include @@ -306,7 +306,7 @@ do_exception(struct pt_regs *regs, unsigned long error_code, int write) * interrupts again and then search the VMAs */ local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); down_read(&mm->mmap_sem); si_code = SEGV_MAPERR; @@ -366,11 +366,11 @@ good_area: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } up_read(&mm->mmap_sem); diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 4df3570fe511..b940424f8ccc 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -16,7 +16,7 @@ config SUPERH select HAVE_IOREMAP_PROT if MMU select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select HAVE_KERNEL_GZIP select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA diff --git a/arch/sh/include/asm/perf_counter.h b/arch/sh/include/asm/perf_counter.h deleted file mode 100644 index d8e6bb9c0ccc..000000000000 --- a/arch/sh/include/asm/perf_counter.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef __ASM_SH_PERF_COUNTER_H -#define __ASM_SH_PERF_COUNTER_H - -/* SH only supports software counters through this interface. */ -static inline void set_perf_counter_pending(void) {} - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#endif /* __ASM_SH_PERF_COUNTER_H */ diff --git a/arch/sh/include/asm/perf_event.h b/arch/sh/include/asm/perf_event.h new file mode 100644 index 000000000000..11a302297ab7 --- /dev/null +++ b/arch/sh/include/asm/perf_event.h @@ -0,0 +1,9 @@ +#ifndef __ASM_SH_PERF_EVENT_H +#define __ASM_SH_PERF_EVENT_H + +/* SH only supports software events through this interface. */ +static inline void set_perf_event_pending(void) {} + +#define PERF_EVENT_INDEX_OFFSET 0 + +#endif /* __ASM_SH_PERF_EVENT_H */ diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h index 925dd40d9d55..f3fd1b9eb6b1 100644 --- a/arch/sh/include/asm/unistd_32.h +++ b/arch/sh/include/asm/unistd_32.h @@ -344,7 +344,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #define NR_syscalls 337 diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h index 2b84bc916bc5..343ce8f073ea 100644 --- a/arch/sh/include/asm/unistd_64.h +++ b/arch/sh/include/asm/unistd_64.h @@ -384,7 +384,7 @@ #define __NR_preadv 361 #define __NR_pwritev 362 #define __NR_rt_tgsigqueueinfo 363 -#define __NR_perf_counter_open 364 +#define __NR_perf_event_open 364 #ifdef __KERNEL__ diff --git a/arch/sh/kernel/syscalls_32.S b/arch/sh/kernel/syscalls_32.S index 16ba225ede89..19fd11dd9871 100644 --- a/arch/sh/kernel/syscalls_32.S +++ b/arch/sh/kernel/syscalls_32.S @@ -352,4 +352,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/sh/kernel/syscalls_64.S b/arch/sh/kernel/syscalls_64.S index af6fb7410c21..5bfde6c77498 100644 --- a/arch/sh/kernel/syscalls_64.S +++ b/arch/sh/kernel/syscalls_64.S @@ -390,4 +390,4 @@ sys_call_table: .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/sh/mm/fault_32.c b/arch/sh/mm/fault_32.c index 781b413ff82d..47530104e0ad 100644 --- a/arch/sh/mm/fault_32.c +++ b/arch/sh/mm/fault_32.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include #include #include @@ -157,7 +157,7 @@ asmlinkage void __kprobes do_page_fault(struct pt_regs *regs, if ((regs->sr & SR_IMASK) != SR_IMASK) local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -208,11 +208,11 @@ survive: } if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/sh/mm/tlbflush_64.c b/arch/sh/mm/tlbflush_64.c index 2dcc48528f7a..de0b0e881823 100644 --- a/arch/sh/mm/tlbflush_64.c +++ b/arch/sh/mm/tlbflush_64.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #include #include #include @@ -116,7 +116,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, /* Not an IO address, so reenable interrupts */ local_irq_enable(); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt or have no user @@ -201,11 +201,11 @@ survive: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 86b82348b97c..97fca4695e0b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -25,7 +25,7 @@ config SPARC select ARCH_WANT_OPTIONAL_GPIOLIB select RTC_CLASS select RTC_DRV_M48T59 - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG @@ -47,7 +47,7 @@ config SPARC64 select RTC_DRV_BQ4802 select RTC_DRV_SUN4V select RTC_DRV_STARFIRE - select HAVE_PERF_COUNTERS + select HAVE_PERF_EVENTS config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/perf_counter.h b/arch/sparc/include/asm/perf_counter.h deleted file mode 100644 index 5d7a8ca0e491..000000000000 --- a/arch/sparc/include/asm/perf_counter.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef __ASM_SPARC_PERF_COUNTER_H -#define __ASM_SPARC_PERF_COUNTER_H - -extern void set_perf_counter_pending(void); - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -#else -static inline void init_hw_perf_counters(void) { } -#endif - -#endif diff --git a/arch/sparc/include/asm/perf_event.h b/arch/sparc/include/asm/perf_event.h new file mode 100644 index 000000000000..7e2669894ce8 --- /dev/null +++ b/arch/sparc/include/asm/perf_event.h @@ -0,0 +1,14 @@ +#ifndef __ASM_SPARC_PERF_EVENT_H +#define __ASM_SPARC_PERF_EVENT_H + +extern void set_perf_event_pending(void); + +#define PERF_EVENT_INDEX_OFFSET 0 + +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +#else +static inline void init_hw_perf_events(void) { } +#endif + +#endif diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index 706df669f3b8..42f2316c3eaa 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -395,7 +395,7 @@ #define __NR_preadv 324 #define __NR_pwritev 325 #define __NR_rt_tgsigqueueinfo 326 -#define __NR_perf_counter_open 327 +#define __NR_perf_event_open 327 #define NR_SYSCALLS 328 diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index 247cc620cee5..3a048fad7ee2 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -104,5 +104,5 @@ obj-$(CONFIG_AUDIT) += audit.o audit--$(CONFIG_AUDIT) := compat_audit.o obj-$(CONFIG_COMPAT) += $(audit--y) -pc--$(CONFIG_PERF_COUNTERS) := perf_counter.o +pc--$(CONFIG_PERF_EVENTS) := perf_event.o obj-$(CONFIG_SPARC64) += $(pc--y) diff --git a/arch/sparc/kernel/nmi.c b/arch/sparc/kernel/nmi.c index 378eb53e0776..b129611590a4 100644 --- a/arch/sparc/kernel/nmi.c +++ b/arch/sparc/kernel/nmi.c @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include #include @@ -265,7 +265,7 @@ int __init nmi_init(void) } } if (!err) - init_hw_perf_counters(); + init_hw_perf_events(); return err; } diff --git a/arch/sparc/kernel/pcr.c b/arch/sparc/kernel/pcr.c index 68ff00107073..2d94e7a03af5 100644 --- a/arch/sparc/kernel/pcr.c +++ b/arch/sparc/kernel/pcr.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -15,7 +15,7 @@ /* This code is shared between various users of the performance * counters. Users will be oprofile, pseudo-NMI watchdog, and the - * perf_counter support layer. + * perf_event support layer. */ #define PCR_SUN4U_ENABLE (PCR_PIC_PRIV | PCR_STRACE | PCR_UTRACE) @@ -42,14 +42,14 @@ void deferred_pcr_work_irq(int irq, struct pt_regs *regs) old_regs = set_irq_regs(regs); irq_enter(); -#ifdef CONFIG_PERF_COUNTERS - perf_counter_do_pending(); +#ifdef CONFIG_PERF_EVENTS + perf_event_do_pending(); #endif irq_exit(); set_irq_regs(old_regs); } -void set_perf_counter_pending(void) +void set_perf_event_pending(void) { set_softint(1 << PIL_DEFERRED_PCR_WORK); } diff --git a/arch/sparc/kernel/perf_counter.c b/arch/sparc/kernel/perf_counter.c deleted file mode 100644 index b1265ce8a053..000000000000 --- a/arch/sparc/kernel/perf_counter.c +++ /dev/null @@ -1,556 +0,0 @@ -/* Performance counter support for sparc64. - * - * Copyright (C) 2009 David S. Miller - * - * This code is based almost entirely upon the x86 perf counter - * code, which is: - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - */ - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -/* Sparc64 chips have two performance counters, 32-bits each, with - * overflow interrupts generated on transition from 0xffffffff to 0. - * The counters are accessed in one go using a 64-bit register. - * - * Both counters are controlled using a single control register. The - * only way to stop all sampling is to clear all of the context (user, - * supervisor, hypervisor) sampling enable bits. But these bits apply - * to both counters, thus the two counters can't be enabled/disabled - * individually. - * - * The control register has two event fields, one for each of the two - * counters. It's thus nearly impossible to have one counter going - * while keeping the other one stopped. Therefore it is possible to - * get overflow interrupts for counters not currently "in use" and - * that condition must be checked in the overflow interrupt handler. - * - * So we use a hack, in that we program inactive counters with the - * "sw_count0" and "sw_count1" events. These count how many times - * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an - * unusual way to encode a NOP and therefore will not trigger in - * normal code. - */ - -#define MAX_HWCOUNTERS 2 -#define MAX_PERIOD ((1UL << 32) - 1) - -#define PIC_UPPER_INDEX 0 -#define PIC_LOWER_INDEX 1 - -struct cpu_hw_counters { - struct perf_counter *counters[MAX_HWCOUNTERS]; - unsigned long used_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)]; - unsigned long active_mask[BITS_TO_LONGS(MAX_HWCOUNTERS)]; - int enabled; -}; -DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; - -struct perf_event_map { - u16 encoding; - u8 pic_mask; -#define PIC_NONE 0x00 -#define PIC_UPPER 0x01 -#define PIC_LOWER 0x02 -}; - -struct sparc_pmu { - const struct perf_event_map *(*event_map)(int); - int max_events; - int upper_shift; - int lower_shift; - int event_mask; - int hv_bit; - int irq_bit; - int upper_nop; - int lower_nop; -}; - -static const struct perf_event_map ultra3i_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER }, - [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER }, -}; - -static const struct perf_event_map *ultra3i_event_map(int event) -{ - return &ultra3i_perfmon_event_map[event]; -} - -static const struct sparc_pmu ultra3i_pmu = { - .event_map = ultra3i_event_map, - .max_events = ARRAY_SIZE(ultra3i_perfmon_event_map), - .upper_shift = 11, - .lower_shift = 4, - .event_mask = 0x3f, - .upper_nop = 0x1c, - .lower_nop = 0x14, -}; - -static const struct perf_event_map niagara2_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER }, - [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER }, -}; - -static const struct perf_event_map *niagara2_event_map(int event) -{ - return &niagara2_perfmon_event_map[event]; -} - -static const struct sparc_pmu niagara2_pmu = { - .event_map = niagara2_event_map, - .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), - .upper_shift = 19, - .lower_shift = 6, - .event_mask = 0xfff, - .hv_bit = 0x8, - .irq_bit = 0x03, - .upper_nop = 0x220, - .lower_nop = 0x220, -}; - -static const struct sparc_pmu *sparc_pmu __read_mostly; - -static u64 event_encoding(u64 event, int idx) -{ - if (idx == PIC_UPPER_INDEX) - event <<= sparc_pmu->upper_shift; - else - event <<= sparc_pmu->lower_shift; - return event; -} - -static u64 mask_for_index(int idx) -{ - return event_encoding(sparc_pmu->event_mask, idx); -} - -static u64 nop_for_index(int idx) -{ - return event_encoding(idx == PIC_UPPER_INDEX ? - sparc_pmu->upper_nop : - sparc_pmu->lower_nop, idx); -} - -static inline void sparc_pmu_enable_counter(struct hw_perf_counter *hwc, - int idx) -{ - u64 val, mask = mask_for_index(idx); - - val = pcr_ops->read(); - pcr_ops->write((val & ~mask) | hwc->config); -} - -static inline void sparc_pmu_disable_counter(struct hw_perf_counter *hwc, - int idx) -{ - u64 mask = mask_for_index(idx); - u64 nop = nop_for_index(idx); - u64 val = pcr_ops->read(); - - pcr_ops->write((val & ~mask) | nop); -} - -void hw_perf_enable(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - int i; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - val = pcr_ops->read(); - - for (i = 0; i < MAX_HWCOUNTERS; i++) { - struct perf_counter *cp = cpuc->counters[i]; - struct hw_perf_counter *hwc; - - if (!cp) - continue; - hwc = &cp->hw; - val |= hwc->config_base; - } - - pcr_ops->write(val); -} - -void hw_perf_disable(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - - val = pcr_ops->read(); - val &= ~(PCR_UTRACE | PCR_STRACE | - sparc_pmu->hv_bit | sparc_pmu->irq_bit); - pcr_ops->write(val); -} - -static u32 read_pmc(int idx) -{ - u64 val; - - read_pic(val); - if (idx == PIC_UPPER_INDEX) - val >>= 32; - - return val & 0xffffffff; -} - -static void write_pmc(int idx, u64 val) -{ - u64 shift, mask, pic; - - shift = 0; - if (idx == PIC_UPPER_INDEX) - shift = 32; - - mask = ((u64) 0xffffffff) << shift; - val <<= shift; - - read_pic(pic); - pic &= ~mask; - pic |= val; - write_pic(pic); -} - -static int sparc_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int ret = 0; - - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - if (left > MAX_PERIOD) - left = MAX_PERIOD; - - atomic64_set(&hwc->prev_count, (u64)-left); - - write_pmc(idx, (u64)(-left) & 0xffffffff); - - perf_counter_update_userpage(counter); - - return ret; -} - -static int sparc_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - sparc_pmu_disable_counter(hwc, idx); - - cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active_mask); - - sparc_perf_counter_set_period(counter, hwc, idx); - sparc_pmu_enable_counter(hwc, idx); - perf_counter_update_userpage(counter); - return 0; -} - -static u64 sparc_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - int shift = 64 - 32; - u64 prev_raw_count, new_raw_count; - s64 delta; - -again: - prev_raw_count = atomic64_read(&hwc->prev_count); - new_raw_count = read_pmc(idx); - - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -static void sparc_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - clear_bit(idx, cpuc->active_mask); - sparc_pmu_disable_counter(hwc, idx); - - barrier(); - - sparc_perf_counter_update(counter, hwc, idx); - cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_counter_update_userpage(counter); -} - -static void sparc_pmu_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - sparc_perf_counter_update(counter, hwc, hwc->idx); -} - -static void sparc_pmu_unthrottle(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - sparc_pmu_enable_counter(hwc, hwc->idx); -} - -static atomic_t active_counters = ATOMIC_INIT(0); -static DEFINE_MUTEX(pmc_grab_mutex); - -void perf_counter_grab_pmc(void) -{ - if (atomic_inc_not_zero(&active_counters)) - return; - - mutex_lock(&pmc_grab_mutex); - if (atomic_read(&active_counters) == 0) { - if (atomic_read(&nmi_active) > 0) { - on_each_cpu(stop_nmi_watchdog, NULL, 1); - BUG_ON(atomic_read(&nmi_active) != 0); - } - atomic_inc(&active_counters); - } - mutex_unlock(&pmc_grab_mutex); -} - -void perf_counter_release_pmc(void) -{ - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_grab_mutex)) { - if (atomic_read(&nmi_active) == 0) - on_each_cpu(start_nmi_watchdog, NULL, 1); - mutex_unlock(&pmc_grab_mutex); - } -} - -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - perf_counter_release_pmc(); -} - -static int __hw_perf_counter_init(struct perf_counter *counter) -{ - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; - const struct perf_event_map *pmap; - u64 enc; - - if (atomic_read(&nmi_active) < 0) - return -ENODEV; - - if (attr->type != PERF_TYPE_HARDWARE) - return -EOPNOTSUPP; - - if (attr->config >= sparc_pmu->max_events) - return -EINVAL; - - perf_counter_grab_pmc(); - counter->destroy = hw_perf_counter_destroy; - - /* We save the enable bits in the config_base. So to - * turn off sampling just write 'config', and to enable - * things write 'config | config_base'. - */ - hwc->config_base = sparc_pmu->irq_bit; - if (!attr->exclude_user) - hwc->config_base |= PCR_UTRACE; - if (!attr->exclude_kernel) - hwc->config_base |= PCR_STRACE; - if (!attr->exclude_hv) - hwc->config_base |= sparc_pmu->hv_bit; - - if (!hwc->sample_period) { - hwc->sample_period = MAX_PERIOD; - hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); - } - - pmap = sparc_pmu->event_map(attr->config); - - enc = pmap->encoding; - if (pmap->pic_mask & PIC_UPPER) { - hwc->idx = PIC_UPPER_INDEX; - enc <<= sparc_pmu->upper_shift; - } else { - hwc->idx = PIC_LOWER_INDEX; - enc <<= sparc_pmu->lower_shift; - } - - hwc->config |= enc; - return 0; -} - -static const struct pmu pmu = { - .enable = sparc_pmu_enable, - .disable = sparc_pmu_disable, - .read = sparc_pmu_read, - .unthrottle = sparc_pmu_unthrottle, -}; - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - int err = __hw_perf_counter_init(counter); - - if (err) - return ERR_PTR(err); - return &pmu; -} - -void perf_counter_print_debug(void) -{ - unsigned long flags; - u64 pcr, pic; - int cpu; - - if (!sparc_pmu) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - - pcr = pcr_ops->read(); - read_pic(pic); - - pr_info("\n"); - pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", - cpu, pcr, pic); - - local_irq_restore(flags); -} - -static int __kprobes perf_counter_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) -{ - struct die_args *args = __args; - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct pt_regs *regs; - int idx; - - if (!atomic_read(&active_counters)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - for (idx = 0; idx < MAX_HWCOUNTERS; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; - struct hw_perf_counter *hwc; - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - hwc = &counter->hw; - val = sparc_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << 31)) - continue; - - data.period = counter->hw.last_period; - if (!sparc_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - sparc_pmu_disable_counter(hwc, idx); - } - - return NOTIFY_STOP; -} - -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, -}; - -static bool __init supported_pmu(void) -{ - if (!strcmp(sparc_pmu_type, "ultra3i")) { - sparc_pmu = &ultra3i_pmu; - return true; - } - if (!strcmp(sparc_pmu_type, "niagara2")) { - sparc_pmu = &niagara2_pmu; - return true; - } - return false; -} - -void __init init_hw_perf_counters(void) -{ - pr_info("Performance counters: "); - - if (!supported_pmu()) { - pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); - return; - } - - pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); - - /* All sparc64 PMUs currently have 2 counters. But this simple - * driver only supports one active counter at a time. - */ - perf_max_counters = 1; - - register_die_notifier(&perf_counter_nmi_notifier); -} diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c new file mode 100644 index 000000000000..2d6a1b10c81d --- /dev/null +++ b/arch/sparc/kernel/perf_event.c @@ -0,0 +1,556 @@ +/* Performance event support for sparc64. + * + * Copyright (C) 2009 David S. Miller + * + * This code is based almost entirely upon the x86 perf event + * code, which is: + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* Sparc64 chips have two performance counters, 32-bits each, with + * overflow interrupts generated on transition from 0xffffffff to 0. + * The counters are accessed in one go using a 64-bit register. + * + * Both counters are controlled using a single control register. The + * only way to stop all sampling is to clear all of the context (user, + * supervisor, hypervisor) sampling enable bits. But these bits apply + * to both counters, thus the two counters can't be enabled/disabled + * individually. + * + * The control register has two event fields, one for each of the two + * counters. It's thus nearly impossible to have one counter going + * while keeping the other one stopped. Therefore it is possible to + * get overflow interrupts for counters not currently "in use" and + * that condition must be checked in the overflow interrupt handler. + * + * So we use a hack, in that we program inactive counters with the + * "sw_count0" and "sw_count1" events. These count how many times + * the instruction "sethi %hi(0xfc000), %g0" is executed. It's an + * unusual way to encode a NOP and therefore will not trigger in + * normal code. + */ + +#define MAX_HWEVENTS 2 +#define MAX_PERIOD ((1UL << 32) - 1) + +#define PIC_UPPER_INDEX 0 +#define PIC_LOWER_INDEX 1 + +struct cpu_hw_events { + struct perf_event *events[MAX_HWEVENTS]; + unsigned long used_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; + unsigned long active_mask[BITS_TO_LONGS(MAX_HWEVENTS)]; + int enabled; +}; +DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { .enabled = 1, }; + +struct perf_event_map { + u16 encoding; + u8 pic_mask; +#define PIC_NONE 0x00 +#define PIC_UPPER 0x01 +#define PIC_LOWER 0x02 +}; + +struct sparc_pmu { + const struct perf_event_map *(*event_map)(int); + int max_events; + int upper_shift; + int lower_shift; + int event_mask; + int hv_bit; + int irq_bit; + int upper_nop; + int lower_nop; +}; + +static const struct perf_event_map ultra3i_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { 0x0000, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x0001, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0009, PIC_LOWER }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x0009, PIC_UPPER }, +}; + +static const struct perf_event_map *ultra3i_event_map(int event_id) +{ + return &ultra3i_perfmon_event_map[event_id]; +} + +static const struct sparc_pmu ultra3i_pmu = { + .event_map = ultra3i_event_map, + .max_events = ARRAY_SIZE(ultra3i_perfmon_event_map), + .upper_shift = 11, + .lower_shift = 4, + .event_mask = 0x3f, + .upper_nop = 0x1c, + .lower_nop = 0x14, +}; + +static const struct perf_event_map niagara2_perfmon_event_map[] = { + [PERF_COUNT_HW_CPU_CYCLES] = { 0x02ff, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_INSTRUCTIONS] = { 0x02ff, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_REFERENCES] = { 0x0208, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_CACHE_MISSES] = { 0x0302, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = { 0x0201, PIC_UPPER | PIC_LOWER }, + [PERF_COUNT_HW_BRANCH_MISSES] = { 0x0202, PIC_UPPER | PIC_LOWER }, +}; + +static const struct perf_event_map *niagara2_event_map(int event_id) +{ + return &niagara2_perfmon_event_map[event_id]; +} + +static const struct sparc_pmu niagara2_pmu = { + .event_map = niagara2_event_map, + .max_events = ARRAY_SIZE(niagara2_perfmon_event_map), + .upper_shift = 19, + .lower_shift = 6, + .event_mask = 0xfff, + .hv_bit = 0x8, + .irq_bit = 0x03, + .upper_nop = 0x220, + .lower_nop = 0x220, +}; + +static const struct sparc_pmu *sparc_pmu __read_mostly; + +static u64 event_encoding(u64 event_id, int idx) +{ + if (idx == PIC_UPPER_INDEX) + event_id <<= sparc_pmu->upper_shift; + else + event_id <<= sparc_pmu->lower_shift; + return event_id; +} + +static u64 mask_for_index(int idx) +{ + return event_encoding(sparc_pmu->event_mask, idx); +} + +static u64 nop_for_index(int idx) +{ + return event_encoding(idx == PIC_UPPER_INDEX ? + sparc_pmu->upper_nop : + sparc_pmu->lower_nop, idx); +} + +static inline void sparc_pmu_enable_event(struct hw_perf_event *hwc, + int idx) +{ + u64 val, mask = mask_for_index(idx); + + val = pcr_ops->read(); + pcr_ops->write((val & ~mask) | hwc->config); +} + +static inline void sparc_pmu_disable_event(struct hw_perf_event *hwc, + int idx) +{ + u64 mask = mask_for_index(idx); + u64 nop = nop_for_index(idx); + u64 val = pcr_ops->read(); + + pcr_ops->write((val & ~mask) | nop); +} + +void hw_perf_enable(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + int i; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + val = pcr_ops->read(); + + for (i = 0; i < MAX_HWEVENTS; i++) { + struct perf_event *cp = cpuc->events[i]; + struct hw_perf_event *hwc; + + if (!cp) + continue; + hwc = &cp->hw; + val |= hwc->config_base; + } + + pcr_ops->write(val); +} + +void hw_perf_disable(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + + val = pcr_ops->read(); + val &= ~(PCR_UTRACE | PCR_STRACE | + sparc_pmu->hv_bit | sparc_pmu->irq_bit); + pcr_ops->write(val); +} + +static u32 read_pmc(int idx) +{ + u64 val; + + read_pic(val); + if (idx == PIC_UPPER_INDEX) + val >>= 32; + + return val & 0xffffffff; +} + +static void write_pmc(int idx, u64 val) +{ + u64 shift, mask, pic; + + shift = 0; + if (idx == PIC_UPPER_INDEX) + shift = 32; + + mask = ((u64) 0xffffffff) << shift; + val <<= shift; + + read_pic(pic); + pic &= ~mask; + pic |= val; + write_pic(pic); +} + +static int sparc_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int ret = 0; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + if (left > MAX_PERIOD) + left = MAX_PERIOD; + + atomic64_set(&hwc->prev_count, (u64)-left); + + write_pmc(idx, (u64)(-left) & 0xffffffff); + + perf_event_update_userpage(event); + + return ret; +} + +static int sparc_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + sparc_pmu_disable_event(hwc, idx); + + cpuc->events[idx] = event; + set_bit(idx, cpuc->active_mask); + + sparc_perf_event_set_period(event, hwc, idx); + sparc_pmu_enable_event(hwc, idx); + perf_event_update_userpage(event); + return 0; +} + +static u64 sparc_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + int shift = 64 - 32; + u64 prev_raw_count, new_raw_count; + s64 delta; + +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + new_raw_count = read_pmc(idx); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &event->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static void sparc_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + clear_bit(idx, cpuc->active_mask); + sparc_pmu_disable_event(hwc, idx); + + barrier(); + + sparc_perf_event_update(event, hwc, idx); + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +static void sparc_pmu_read(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + sparc_perf_event_update(event, hwc, hwc->idx); +} + +static void sparc_pmu_unthrottle(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + sparc_pmu_enable_event(hwc, hwc->idx); +} + +static atomic_t active_events = ATOMIC_INIT(0); +static DEFINE_MUTEX(pmc_grab_mutex); + +void perf_event_grab_pmc(void) +{ + if (atomic_inc_not_zero(&active_events)) + return; + + mutex_lock(&pmc_grab_mutex); + if (atomic_read(&active_events) == 0) { + if (atomic_read(&nmi_active) > 0) { + on_each_cpu(stop_nmi_watchdog, NULL, 1); + BUG_ON(atomic_read(&nmi_active) != 0); + } + atomic_inc(&active_events); + } + mutex_unlock(&pmc_grab_mutex); +} + +void perf_event_release_pmc(void) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_grab_mutex)) { + if (atomic_read(&nmi_active) == 0) + on_each_cpu(start_nmi_watchdog, NULL, 1); + mutex_unlock(&pmc_grab_mutex); + } +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + perf_event_release_pmc(); +} + +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + const struct perf_event_map *pmap; + u64 enc; + + if (atomic_read(&nmi_active) < 0) + return -ENODEV; + + if (attr->type != PERF_TYPE_HARDWARE) + return -EOPNOTSUPP; + + if (attr->config >= sparc_pmu->max_events) + return -EINVAL; + + perf_event_grab_pmc(); + event->destroy = hw_perf_event_destroy; + + /* We save the enable bits in the config_base. So to + * turn off sampling just write 'config', and to enable + * things write 'config | config_base'. + */ + hwc->config_base = sparc_pmu->irq_bit; + if (!attr->exclude_user) + hwc->config_base |= PCR_UTRACE; + if (!attr->exclude_kernel) + hwc->config_base |= PCR_STRACE; + if (!attr->exclude_hv) + hwc->config_base |= sparc_pmu->hv_bit; + + if (!hwc->sample_period) { + hwc->sample_period = MAX_PERIOD; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } + + pmap = sparc_pmu->event_map(attr->config); + + enc = pmap->encoding; + if (pmap->pic_mask & PIC_UPPER) { + hwc->idx = PIC_UPPER_INDEX; + enc <<= sparc_pmu->upper_shift; + } else { + hwc->idx = PIC_LOWER_INDEX; + enc <<= sparc_pmu->lower_shift; + } + + hwc->config |= enc; + return 0; +} + +static const struct pmu pmu = { + .enable = sparc_pmu_enable, + .disable = sparc_pmu_disable, + .read = sparc_pmu_read, + .unthrottle = sparc_pmu_unthrottle, +}; + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + int err = __hw_perf_event_init(event); + + if (err) + return ERR_PTR(err); + return &pmu; +} + +void perf_event_print_debug(void) +{ + unsigned long flags; + u64 pcr, pic; + int cpu; + + if (!sparc_pmu) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + + pcr = pcr_ops->read(); + read_pic(pic); + + pr_info("\n"); + pr_info("CPU#%d: PCR[%016llx] PIC[%016llx]\n", + cpu, pcr, pic); + + local_irq_restore(flags); +} + +static int __kprobes perf_event_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct pt_regs *regs; + int idx; + + if (!atomic_read(&active_events)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + for (idx = 0; idx < MAX_HWEVENTS; idx++) { + struct perf_event *event = cpuc->events[idx]; + struct hw_perf_event *hwc; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + hwc = &event->hw; + val = sparc_perf_event_update(event, hwc, idx); + if (val & (1ULL << 31)) + continue; + + data.period = event->hw.last_period; + if (!sparc_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + sparc_pmu_disable_event(hwc, idx); + } + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, +}; + +static bool __init supported_pmu(void) +{ + if (!strcmp(sparc_pmu_type, "ultra3i")) { + sparc_pmu = &ultra3i_pmu; + return true; + } + if (!strcmp(sparc_pmu_type, "niagara2")) { + sparc_pmu = &niagara2_pmu; + return true; + } + return false; +} + +void __init init_hw_perf_events(void) +{ + pr_info("Performance events: "); + + if (!supported_pmu()) { + pr_cont("No support for PMU type '%s'\n", sparc_pmu_type); + return; + } + + pr_cont("Supported PMU type is '%s'\n", sparc_pmu_type); + + /* All sparc64 PMUs currently have 2 events. But this simple + * driver only supports one active event at a time. + */ + perf_max_events = 1; + + register_die_notifier(&perf_event_nmi_notifier); +} diff --git a/arch/sparc/kernel/systbls_32.S b/arch/sparc/kernel/systbls_32.S index 04181577cb65..0f1658d37490 100644 --- a/arch/sparc/kernel/systbls_32.S +++ b/arch/sparc/kernel/systbls_32.S @@ -82,5 +82,5 @@ sys_call_table: /*310*/ .long sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate /*315*/ .long sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .long sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv -/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open +/*325*/ .long sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 91b06b7f7acf..009825f6e73c 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -83,7 +83,7 @@ sys_call_table32: /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, sys_timerfd_create, sys_eventfd, compat_sys_fallocate .word compat_sys_timerfd_settime, compat_sys_timerfd_gettime, compat_sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, compat_sys_preadv - .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_counter_open + .word compat_sys_pwritev, compat_sys_rt_tgsigqueueinfo, sys_perf_event_open #endif /* CONFIG_COMPAT */ @@ -158,4 +158,4 @@ sys_call_table: /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd_create, sys_eventfd, sys_fallocate .word sys_timerfd_settime, sys_timerfd_gettime, sys_signalfd4, sys_eventfd2, sys_epoll_create1 /*320*/ .word sys_dup3, sys_pipe2, sys_inotify_init1, sys_accept4, sys_preadv - .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_counter_open + .word sys_pwritev, sys_rt_tgsigqueueinfo, sys_perf_event_open diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 51c59015b280..e4ff5d1280ca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -24,7 +24,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE - select HAVE_PERF_COUNTERS if (!M386 && !M486) + select HAVE_PERF_EVENTS if (!M386 && !M486) select HAVE_IOREMAP_PROT select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ba331bfd1112..74619c4f9fda 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -831,5 +831,5 @@ ia32_sys_call_table: .quad compat_sys_preadv .quad compat_sys_pwritev .quad compat_sys_rt_tgsigqueueinfo /* 335 */ - .quad sys_perf_counter_open + .quad sys_perf_event_open ia32_syscall_end: diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 5e3f2044f0d3..f5693c81a1db 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,7 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h deleted file mode 100644 index e7b7c938ae27..000000000000 --- a/arch/x86/include/asm/perf_counter.h +++ /dev/null @@ -1,108 +0,0 @@ -#ifndef _ASM_X86_PERF_COUNTER_H -#define _ASM_X86_PERF_COUNTER_H - -/* - * Performance counter hw details: - */ - -#define X86_PMC_MAX_GENERIC 8 -#define X86_PMC_MAX_FIXED 3 - -#define X86_PMC_IDX_GENERIC 0 -#define X86_PMC_IDX_FIXED 32 -#define X86_PMC_IDX_MAX 64 - -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) - -/* - * Includes eventsel and unit mask as well: - */ -#define ARCH_PERFMON_EVENT_MASK 0xffff - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 - -/* - * Intel "Architectural Performance Monitoring" CPUID - * detection/enumeration details: - */ -union cpuid10_eax { - struct { - unsigned int version_id:8; - unsigned int num_counters:8; - unsigned int bit_width:8; - unsigned int mask_length:8; - } split; - unsigned int full; -}; - -union cpuid10_edx { - struct { - unsigned int num_counters_fixed:4; - unsigned int reserved:28; - } split; - unsigned int full; -}; - - -/* - * Fixed-purpose performance counters: - */ - -/* - * All 3 fixed-mode PMCs are configured via this single MSR: - */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d - -/* - * The counts are available in three separate MSRs: - */ - -/* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) - -/* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) - -/* CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) - -/* - * We model BTS tracing as another fixed-mode PMC. - * - * We choose a value in the middle of the fixed counter range, since lower - * values are used by actual fixed counters and higher values are used - * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. - */ -#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) - - -#ifdef CONFIG_PERF_COUNTERS -extern void init_hw_perf_counters(void); -extern void perf_counters_lapic_init(void); - -#define PERF_COUNTER_INDEX_OFFSET 0 - -#else -static inline void init_hw_perf_counters(void) { } -static inline void perf_counters_lapic_init(void) { } -#endif - -#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h new file mode 100644 index 000000000000..ad7ce3fd5065 --- /dev/null +++ b/arch/x86/include/asm/perf_event.h @@ -0,0 +1,108 @@ +#ifndef _ASM_X86_PERF_EVENT_H +#define _ASM_X86_PERF_EVENT_H + +/* + * Performance event hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) + +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 + +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ +union cpuid10_eax { + struct { + unsigned int version_id:8; + unsigned int num_events:8; + unsigned int bit_width:8; + unsigned int mask_length:8; + } split; + unsigned int full; +}; + +union cpuid10_edx { + struct { + unsigned int num_events_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance events: + */ + +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) + +/* + * We model BTS tracing as another fixed-mode PMC. + * + * We choose a value in the middle of the fixed event range, since lower + * values are used by actual fixed events and higher values are used + * to indicate other overflow conditions in the PERF_GLOBAL_STATUS msr. + */ +#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) + + +#ifdef CONFIG_PERF_EVENTS +extern void init_hw_perf_events(void); +extern void perf_events_lapic_init(void); + +#define PERF_EVENT_INDEX_OFFSET 0 + +#else +static inline void init_hw_perf_events(void) { } +static inline void perf_events_lapic_init(void) { } +#endif + +#endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 8deaada61bc8..6fb3c209a7e3 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -341,7 +341,7 @@ #define __NR_preadv 333 #define __NR_pwritev 334 #define __NR_rt_tgsigqueueinfo 335 -#define __NR_perf_counter_open 336 +#define __NR_perf_event_open 336 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index b9f3c60de5f7..8d3ad0adbc68 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -659,8 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv) __SYSCALL(__NR_pwritev, sys_pwritev) #define __NR_rt_tgsigqueueinfo 297 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 298 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 298 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a34601f52987..754174d09deb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,7 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ -#include +#include #include #include #include @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include @@ -1189,7 +1189,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif - perf_counters_lapic_init(); + perf_events_lapic_init(); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 8dd30638fe44..68537e957a9b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -27,7 +27,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2fea97eccf77..cc25c2b4a567 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,7 +13,7 @@ #include #include -#include +#include #include #include #include @@ -869,7 +869,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif - init_hw_perf_counters(); + init_hw_perf_events(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c deleted file mode 100644 index b1f115696c84..000000000000 --- a/arch/x86/kernel/cpu/perf_counter.c +++ /dev/null @@ -1,2298 +0,0 @@ -/* - * Performance counter x86 architecture code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2009 Jaswinder Singh Rajput - * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright (C) 2009 Intel Corporation, - * - * For licencing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -static u64 perf_counter_mask __read_mostly; - -/* The maximal number of PEBS counters: */ -#define MAX_PEBS_COUNTERS 4 - -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 - -/* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) - -/* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) - - -/* - * Bits in the debugctlmsr controlling branch tracing. - */ -#define X86_DEBUGCTL_TR (1 << 6) -#define X86_DEBUGCTL_BTS (1 << 7) -#define X86_DEBUGCTL_BTINT (1 << 8) -#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) -#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) - -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_counter_reset[MAX_PEBS_COUNTERS]; -}; - -struct cpu_hw_counters { - struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long interrupts; - int enabled; - struct debug_store *ds; -}; - -/* - * struct x86_pmu - generic x86 pmu - */ -struct x86_pmu { - const char *name; - int version; - int (*handle_irq)(struct pt_regs *); - void (*disable_all)(void); - void (*enable_all)(void); - void (*enable)(struct hw_perf_counter *, int); - void (*disable)(struct hw_perf_counter *, int); - unsigned eventsel; - unsigned perfctr; - u64 (*event_map)(int); - u64 (*raw_event)(u64); - int max_events; - int num_counters; - int num_counters_fixed; - int counter_bits; - u64 counter_mask; - int apic; - u64 max_period; - u64 intel_ctrl; - void (*enable_bts)(u64 config); - void (*disable_bts)(void); -}; - -static struct x86_pmu x86_pmu __read_mostly; - -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { - .enabled = 1, -}; - -/* - * Not sure about some of these - */ -static const u64 p6_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, -}; - -static u64 p6_pmu_event_map(int hw_event) -{ - return p6_perfmon_event_map[hw_event]; -} - -/* - * Counter setting that is specified not to count anything. - * We use this to effectively disable a counter. - * - * L2_RQSTS with 0 MESI unit mask. - */ -#define P6_NOP_COUNTER 0x0000002EULL - -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_COUNTER_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_COUNTER_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - - -/* - * Intel PerfMon v3. Used on Core2 and later. - */ -static const u64 intel_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, - [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, - [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, -}; - -static u64 intel_pmu_event_map(int hw_event) -{ - return intel_perfmon_event_map[hw_event]; -} - -/* - * Generalized hw caching related hw_event table, filled - * in on a per model basis. A value of 0 means - * 'not supported', -1 means 'hw_event makes no sense on - * this CPU', any other value means the raw hw_event - * ID. - */ - -#define C(x) PERF_COUNT_HW_CACHE_##x - -static u64 __read_mostly hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX]; - -static const u64 nehalem_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ - [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ - [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ - [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0x0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ - [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 core2_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ - [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ - [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static const u64 atom_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ - [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ - [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ - [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ - [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ - [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ - [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (CORE_EVNTSEL_EVENT_MASK | \ - CORE_EVNTSEL_UNIT_MASK | \ - CORE_EVNTSEL_EDGE_MASK | \ - CORE_EVNTSEL_INV_MASK | \ - CORE_EVNTSEL_COUNTER_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static const u64 amd_hw_cache_event_ids - [PERF_COUNT_HW_CACHE_MAX] - [PERF_COUNT_HW_CACHE_OP_MAX] - [PERF_COUNT_HW_CACHE_RESULT_MAX] = -{ - [ C(L1D) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ - [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ - }, - }, - [ C(L1I ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ - [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(LL ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ - [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(DTLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ - [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = 0, - [ C(RESULT_MISS) ] = 0, - }, - }, - [ C(ITLB) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ - [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, - [ C(BPU ) ] = { - [ C(OP_READ) ] = { - [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ - [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ - }, - [ C(OP_WRITE) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - [ C(OP_PREFETCH) ] = { - [ C(RESULT_ACCESS) ] = -1, - [ C(RESULT_MISS) ] = -1, - }, - }, -}; - -/* - * AMD Performance Monitor K7 and later. - */ -static const u64 amd_perfmon_event_map[] = -{ - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, -}; - -static u64 amd_pmu_event_map(int hw_event) -{ - return amd_perfmon_event_map[hw_event]; -} - -static u64 amd_pmu_raw_event(u64 hw_event) -{ -#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_COUNTER_MASK) - - return hw_event & K7_EVNTSEL_MASK; -} - -/* - * Propagate counter elapsed time into the generic counter. - * Can only be executed on the CPU where the counter is active. - * Returns the delta events processed. - */ -static u64 -x86_perf_counter_update(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - int shift = 64 - x86_pmu.counter_bits; - u64 prev_raw_count, new_raw_count; - s64 delta; - - if (idx == X86_PMC_IDX_FIXED_BTS) - return 0; - - /* - * Careful: an NMI might modify the previous counter value. - * - * Our tactic to handle this is to first atomically read and - * exchange a new raw count - then add that new-prev delta - * count to the generic counter atomically: - */ -again: - prev_raw_count = atomic64_read(&hwc->prev_count); - rdmsrl(hwc->counter_base + idx, new_raw_count); - - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count) - goto again; - - /* - * Now we have the new raw value and have updated the prev - * timestamp already. We can now calculate the elapsed delta - * (counter-)time and add that to the generic counter. - * - * Careful, not all hw sign-extends above the physical width - * of the count. - */ - delta = (new_raw_count << shift) - (prev_raw_count << shift); - delta >>= shift; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); - - return new_raw_count; -} - -static atomic_t active_counters; -static DEFINE_MUTEX(pmc_reserve_mutex); - -static bool reserve_pmc_hardware(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int i; - - if (nmi_watchdog == NMI_LOCAL_APIC) - disable_lapic_nmi_watchdog(); - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) - goto perfctr_fail; - } - - for (i = 0; i < x86_pmu.num_counters; i++) { - if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) - goto eventsel_fail; - } -#endif - - return true; - -#ifdef CONFIG_X86_LOCAL_APIC -eventsel_fail: - for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu.eventsel + i); - - i = x86_pmu.num_counters; - -perfctr_fail: - for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu.perfctr + i); - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); - - return false; -#endif -} - -static void release_pmc_hardware(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - int i; - - for (i = 0; i < x86_pmu.num_counters; i++) { - release_perfctr_nmi(x86_pmu.perfctr + i); - release_evntsel_nmi(x86_pmu.eventsel + i); - } - - if (nmi_watchdog == NMI_LOCAL_APIC) - enable_lapic_nmi_watchdog(); -#endif -} - -static inline bool bts_available(void) -{ - return x86_pmu.enable_bts != NULL; -} - -static inline void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -static inline void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_counters, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static void release_bts_hardware(void) -{ - int cpu; - - if (!bts_available()) - return; - - get_online_cpus(); - - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_counters, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_counters, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); - } - - put_online_cpus(); -} - -static int reserve_bts_hardware(void) -{ - int cpu, err = 0; - - if (!bts_available()) - return 0; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - - err = -ENOMEM; - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) { - kfree(buffer); - break; - } - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = - ds->bts_buffer_base + BTS_BUFFER_SIZE; - ds->bts_interrupt_threshold = - ds->bts_absolute_maximum - BTS_OVFL_TH; - - per_cpu(cpu_hw_counters, cpu).ds = ds; - err = 0; - } - - if (err) - release_bts_hardware(); - else { - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); - - return err; -} - -static void hw_perf_counter_destroy(struct perf_counter *counter) -{ - if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { - release_pmc_hardware(); - release_bts_hardware(); - mutex_unlock(&pmc_reserve_mutex); - } -} - -static inline int x86_pmu_initialized(void) -{ - return x86_pmu.handle_irq != NULL; -} - -static inline int -set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) -{ - unsigned int cache_type, cache_op, cache_result; - u64 config, val; - - config = attr->config; - - cache_type = (config >> 0) & 0xff; - if (cache_type >= PERF_COUNT_HW_CACHE_MAX) - return -EINVAL; - - cache_op = (config >> 8) & 0xff; - if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) - return -EINVAL; - - cache_result = (config >> 16) & 0xff; - if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) - return -EINVAL; - - val = hw_cache_event_ids[cache_type][cache_op][cache_result]; - - if (val == 0) - return -ENOENT; - - if (val == -1) - return -EINVAL; - - hwc->config |= val; - - return 0; -} - -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - -/* - * Setup the hardware configuration for a given attr_type - */ -static int __hw_perf_counter_init(struct perf_counter *counter) -{ - struct perf_counter_attr *attr = &counter->attr; - struct hw_perf_counter *hwc = &counter->hw; - u64 config; - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = 0; - if (!atomic_inc_not_zero(&active_counters)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_counters) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - err = reserve_bts_hardware(); - } - if (!err) - atomic_inc(&active_counters); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; - - counter->destroy = hw_perf_counter_destroy; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - hwc->config = ARCH_PERFMON_EVENTSEL_INT; - - /* - * Count user and OS events unless requested not to. - */ - if (!attr->exclude_user) - hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!attr->exclude_kernel) - hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - - if (!hwc->sample_period) { - hwc->sample_period = x86_pmu.max_period; - hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); - } else { - /* - * If we have a PMU initialized but no APIC - * interrupts, we cannot sample hardware - * counters (user-space has to fall back and - * sample via a hrtimer based software counter): - */ - if (!x86_pmu.apic) - return -EOPNOTSUPP; - } - - /* - * Raw hw_event type provide the config in the hw_event structure - */ - if (attr->type == PERF_TYPE_RAW) { - hwc->config |= x86_pmu.raw_event(attr->config); - return 0; - } - - if (attr->type == PERF_TYPE_HW_CACHE) - return set_ext_hw_attr(hwc, attr); - - if (attr->config >= x86_pmu.max_events) - return -EINVAL; - - /* - * The generic map: - */ - config = x86_pmu.event_map(attr->config); - - if (config == 0) - return -ENOENT; - - if (config == -1LL) - return -EINVAL; - - /* - * Branch tracing: - */ - if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && - (hwc->sample_period == 1)) { - /* BTS is not supported by this architecture. */ - if (!bts_available()) - return -EOPNOTSUPP; - - /* BTS is currently only allowed for user-mode. */ - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - return -EOPNOTSUPP; - } - - hwc->config |= config; - - return 0; -} - -static void p6_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - barrier(); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) - intel_pmu_disable_bts(); -} - -static void amd_pmu_disable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int idx; - - if (!cpuc->enabled) - return; - - cpuc->enabled = 0; - /* - * ensure we write the disable before we start disabling the - * counters proper, so that amd_pmu_enable_counter() does the - * right thing. - */ - barrier(); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) - continue; - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} - -void hw_perf_disable(void) -{ - if (!x86_pmu_initialized()) - return; - return x86_pmu.disable_all(); -} - -static void p6_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - unsigned long val; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - /* p6 only has one enable register */ - rdmsrl(MSR_P6_EVNTSEL0, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_P6_EVNTSEL0, val); -} - -static void intel_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); - - if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { - struct perf_counter *counter = - cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - - if (WARN_ON_ONCE(!counter)) - return; - - intel_pmu_enable_bts(counter->hw.config); - } -} - -static void amd_pmu_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int idx; - - if (cpuc->enabled) - return; - - cpuc->enabled = 1; - barrier(); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - struct perf_counter *counter = cpuc->counters[idx]; - u64 val; - - if (!test_bit(idx, cpuc->active_mask)) - continue; - - val = counter->hw.config; - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } -} - -void hw_perf_enable(void) -{ - if (!x86_pmu_initialized()) - return; - x86_pmu.enable_all(); -} - -static inline u64 intel_pmu_get_status(void) -{ - u64 status; - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - - return status; -} - -static inline void intel_pmu_ack_status(u64 ack) -{ - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); -} - -static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} - -static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); -} - -static inline void -intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, mask; - - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static inline void -p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val = P6_NOP_COUNTER; - - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - -static inline void -intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - intel_pmu_disable_bts(); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_disable_fixed(hwc, idx); - return; - } - - x86_pmu_disable_counter(hwc, idx); -} - -static inline void -amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) -{ - x86_pmu_disable_counter(hwc, idx); -} - -static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); - -/* - * Set the next IRQ period, based on the hwc->period_left value. - * To be called with the counter disabled in hw: - */ -static int -x86_perf_counter_set_period(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - int err, ret = 0; - - if (idx == X86_PMC_IDX_FIXED_BTS) - return 0; - - /* - * If we are way outside a reasoable range then just skip forward: - */ - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - ret = 1; - } - /* - * Quirk: certain CPUs dont like it if just 1 hw_event is left: - */ - if (unlikely(left < 2)) - left = 2; - - if (left > x86_pmu.max_period) - left = x86_pmu.max_period; - - per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - - /* - * The hw counter starts counting from this counter offset, - * mark it to be able to extra future deltas: - */ - atomic64_set(&hwc->prev_count, (u64)-left); - - err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & x86_pmu.counter_mask); - - perf_counter_update_userpage(counter); - - return ret; -} - -static inline void -intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) -{ - int idx = __idx - X86_PMC_IDX_FIXED; - u64 ctrl_val, bits, mask; - int err; - - /* - * Enable IRQ generation (0x8), - * and enable ring-3 counting (0x2) and ring-0 counting (0x1) - * if requested: - */ - bits = 0x8ULL; - if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) - bits |= 0x2; - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) - bits |= 0x1; - bits <<= (idx * 4); - mask = 0xfULL << (idx * 4); - - rdmsrl(hwc->config_base, ctrl_val); - ctrl_val &= ~mask; - ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - u64 val; - - val = hwc->config; - if (cpuc->enabled) - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - (void)checking_wrmsrl(hwc->config_base + idx, val); -} - - -static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { - if (!__get_cpu_var(cpu_hw_counters).enabled) - return; - - intel_pmu_enable_bts(hwc->config); - return; - } - - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { - intel_pmu_enable_fixed(hwc, idx); - return; - } - - x86_pmu_enable_counter(hwc, idx); -} - -static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - x86_pmu_enable_counter(hwc, idx); -} - -static int -fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) -{ - unsigned int hw_event; - - hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; - - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (hwc->sample_period == 1))) - return X86_PMC_IDX_FIXED_BTS; - - if (!x86_pmu.num_counters_fixed) - return -1; - - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) - return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) - return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) - return X86_PMC_IDX_FIXED_BUS_CYCLES; - - return -1; -} - -/* - * Find a PMC slot for the freshly enabled / scheduled in counter: - */ -static int x86_pmu_enable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx; - - idx = fixed_mode_idx(counter, hwc); - if (idx == X86_PMC_IDX_FIXED_BTS) { - /* BTS is already occupied. */ - if (test_and_set_bit(idx, cpuc->used_mask)) - return -EAGAIN; - - hwc->config_base = 0; - hwc->counter_base = 0; - hwc->idx = idx; - } else if (idx >= 0) { - /* - * Try to get the fixed counter, if that is already taken - * then try to get a generic counter: - */ - if (test_and_set_bit(idx, cpuc->used_mask)) - goto try_generic; - - hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; - /* - * We set it so that counter_base + idx in wrmsr/rdmsr maps to - * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: - */ - hwc->counter_base = - MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; - hwc->idx = idx; - } else { - idx = hwc->idx; - /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used_mask)) { -try_generic: - idx = find_first_zero_bit(cpuc->used_mask, - x86_pmu.num_counters); - if (idx == x86_pmu.num_counters) - return -EAGAIN; - - set_bit(idx, cpuc->used_mask); - hwc->idx = idx; - } - hwc->config_base = x86_pmu.eventsel; - hwc->counter_base = x86_pmu.perfctr; - } - - perf_counters_lapic_init(); - - x86_pmu.disable(hwc, idx); - - cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active_mask); - - x86_perf_counter_set_period(counter, hwc, idx); - x86_pmu.enable(hwc, idx); - - perf_counter_update_userpage(counter); - - return 0; -} - -static void x86_pmu_unthrottle(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - - if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || - cpuc->counters[hwc->idx] != counter)) - return; - - x86_pmu.enable(hwc, hwc->idx); -} - -void perf_counter_print_debug(void) -{ - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; - struct cpu_hw_counters *cpuc; - unsigned long flags; - int cpu, idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - if (x86_pmu.version >= 2) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - - pr_info("\n"); - pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); - pr_info("CPU#%d: status: %016llx\n", cpu, status); - pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); - pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); - } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu.perfctr + idx, pmc_count); - - prev_left = per_cpu(pmc_prev_left[idx], cpu); - - pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", - cpu, idx, pmc_ctrl); - pr_info("CPU#%d: gen-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - pr_info("CPU#%d: gen-PMC%d left: %016llx\n", - cpu, idx, prev_left); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - - pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", - cpu, idx, pmc_count); - } - local_irq_restore(flags); -} - -static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc) -{ - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!counter) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - - data.period = counter->hw.last_period; - data.addr = 0; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, counter, ®s); - - if (perf_output_begin(&handle, counter, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, counter); - } - - perf_output_end(&handle); - - /* There's new data available. */ - counter->hw.interrupts++; - counter->pending_kill = POLL_IN; -} - -static void x86_pmu_disable(struct perf_counter *counter) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - - /* - * Must be done before we disable, otherwise the nmi handler - * could reenable again: - */ - clear_bit(idx, cpuc->active_mask); - x86_pmu.disable(hwc, idx); - - /* - * Make sure the cleared pointer becomes visible before we - * (potentially) free the counter: - */ - barrier(); - - /* - * Drain the remaining delta count out of a counter - * that we are disabling: - */ - x86_perf_counter_update(counter, hwc, idx); - - /* Drain the remaining BTS records. */ - if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) - intel_pmu_drain_bts_buffer(cpuc); - - cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used_mask); - - perf_counter_update_userpage(counter); -} - -/* - * Save and restart an expired counter. Called by NMI contexts, - * so it has to be careful about preempting normal counter ops: - */ -static int intel_pmu_save_and_restart(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; - int ret; - - x86_perf_counter_update(counter, hwc, idx); - ret = x86_perf_counter_set_period(counter, hwc, idx); - - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - intel_pmu_enable_counter(hwc, idx); - - return ret; -} - -static void intel_pmu_reset(void) -{ - struct debug_store *ds = __get_cpu_var(cpu_hw_counters).ds; - unsigned long flags; - int idx; - - if (!x86_pmu.num_counters) - return; - - local_irq_save(flags); - - printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); - checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); - } - for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { - checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } - if (ds) - ds->bts_index = ds->bts_buffer_base; - - local_irq_restore(flags); -} - -static int p6_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - counter = cpuc->counters[idx]; - hwc = &counter->hw; - - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; - - /* - * counter overflow - */ - handled = 1; - data.period = counter->hw.last_period; - - if (!x86_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - p6_pmu_disable_counter(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -/* - * This handler is triggered by the local APIC, so the APIC IRQ handling - * rules apply: - */ -static int intel_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - int bit, loops; - u64 ack, status; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - perf_disable(); - intel_pmu_drain_bts_buffer(cpuc); - status = intel_pmu_get_status(); - if (!status) { - perf_enable(); - return 0; - } - - loops = 0; -again: - if (++loops > 100) { - WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); - perf_counter_print_debug(); - intel_pmu_reset(); - perf_enable(); - return 1; - } - - inc_irq_stat(apic_perf_irqs); - ack = status; - for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active_mask)) - continue; - - if (!intel_pmu_save_and_restart(counter)) - continue; - - data.period = counter->hw.last_period; - - if (perf_counter_overflow(counter, 1, &data, regs)) - intel_pmu_disable_counter(&counter->hw, bit); - } - - intel_pmu_ack_status(ack); - - /* - * Repeat if there is more work to be done: - */ - status = intel_pmu_get_status(); - if (status) - goto again; - - perf_enable(); - - return 1; -} - -static int amd_pmu_handle_irq(struct pt_regs *regs) -{ - struct perf_sample_data data; - struct cpu_hw_counters *cpuc; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - int idx, handled = 0; - u64 val; - - data.addr = 0; - - cpuc = &__get_cpu_var(cpu_hw_counters); - - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active_mask)) - continue; - - counter = cpuc->counters[idx]; - hwc = &counter->hw; - - val = x86_perf_counter_update(counter, hwc, idx); - if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; - - /* - * counter overflow - */ - handled = 1; - data.period = counter->hw.last_period; - - if (!x86_perf_counter_set_period(counter, hwc, idx)) - continue; - - if (perf_counter_overflow(counter, 1, &data, regs)) - amd_pmu_disable_counter(hwc, idx); - } - - if (handled) - inc_irq_stat(apic_perf_irqs); - - return handled; -} - -void smp_perf_pending_interrupt(struct pt_regs *regs) -{ - irq_enter(); - ack_APIC_irq(); - inc_irq_stat(apic_pending_irqs); - perf_counter_do_pending(); - irq_exit(); -} - -void set_perf_counter_pending(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - apic->send_IPI_self(LOCAL_PENDING_VECTOR); -#endif -} - -void perf_counters_lapic_init(void) -{ -#ifdef CONFIG_X86_LOCAL_APIC - if (!x86_pmu.apic || !x86_pmu_initialized()) - return; - - /* - * Always use NMI for PMU - */ - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif -} - -static int __kprobes -perf_counter_nmi_handler(struct notifier_block *self, - unsigned long cmd, void *__args) -{ - struct die_args *args = __args; - struct pt_regs *regs; - - if (!atomic_read(&active_counters)) - return NOTIFY_DONE; - - switch (cmd) { - case DIE_NMI: - case DIE_NMI_IPI: - break; - - default: - return NOTIFY_DONE; - } - - regs = args->regs; - -#ifdef CONFIG_X86_LOCAL_APIC - apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif - /* - * Can't rely on the handled return value to say it was our NMI, two - * counters could trigger 'simultaneously' raising two back-to-back NMIs. - * - * If the first NMI handles both, the latter will be empty and daze - * the CPU. - */ - x86_pmu.handle_irq(regs); - - return NOTIFY_STOP; -} - -static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler, - .next = NULL, - .priority = 1 -}; - -static struct x86_pmu p6_pmu = { - .name = "p6", - .handle_irq = p6_pmu_handle_irq, - .disable_all = p6_pmu_disable_all, - .enable_all = p6_pmu_enable_all, - .enable = p6_pmu_enable_counter, - .disable = p6_pmu_disable_counter, - .eventsel = MSR_P6_EVNTSEL0, - .perfctr = MSR_P6_PERFCTR0, - .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, - .max_events = ARRAY_SIZE(p6_perfmon_event_map), - .apic = 1, - .max_period = (1ULL << 31) - 1, - .version = 0, - .num_counters = 2, - /* - * Counters have 40 bits implemented. However they are designed such - * that bits [32-39] are sign extensions of bit 31. As such the - * effective width of a counter for P6-like PMU is 32 bits only. - * - * See IA-32 Intel Architecture Software developer manual Vol 3B - */ - .counter_bits = 32, - .counter_mask = (1ULL << 32) - 1, -}; - -static struct x86_pmu intel_pmu = { - .name = "Intel", - .handle_irq = intel_pmu_handle_irq, - .disable_all = intel_pmu_disable_all, - .enable_all = intel_pmu_enable_all, - .enable = intel_pmu_enable_counter, - .disable = intel_pmu_disable_counter, - .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, - .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, - .max_events = ARRAY_SIZE(intel_perfmon_event_map), - .apic = 1, - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, -}; - -static struct x86_pmu amd_pmu = { - .name = "AMD", - .handle_irq = amd_pmu_handle_irq, - .disable_all = amd_pmu_disable_all, - .enable_all = amd_pmu_enable_all, - .enable = amd_pmu_enable_counter, - .disable = amd_pmu_disable_counter, - .eventsel = MSR_K7_EVNTSEL0, - .perfctr = MSR_K7_PERFCTR0, - .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, - .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_counters = 4, - .counter_bits = 48, - .counter_mask = (1ULL << 48) - 1, - .apic = 1, - /* use highest bit to detect overflow */ - .max_period = (1ULL << 47) - 1, -}; - -static int p6_pmu_init(void) -{ - switch (boot_cpu_data.x86_model) { - case 1: - case 3: /* Pentium Pro */ - case 5: - case 6: /* Pentium II */ - case 7: - case 8: - case 11: /* Pentium III */ - break; - case 9: - case 13: - /* Pentium M */ - break; - default: - pr_cont("unsupported p6 CPU model %d ", - boot_cpu_data.x86_model); - return -ENODEV; - } - - x86_pmu = p6_pmu; - - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - - return 0; -} - -static int intel_pmu_init(void) -{ - union cpuid10_edx edx; - union cpuid10_eax eax; - unsigned int unused; - unsigned int ebx; - int version; - - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { - return -ENODEV; - } - } - - /* - * Check whether the Architectural PerfMon supports - * Branch Misses Retired hw_event or not. - */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return -ENODEV; - - version = eax.split.version_id; - if (version < 2) - return -ENODEV; - - x86_pmu = intel_pmu; - x86_pmu.version = version; - x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.counter_bits = eax.split.bit_width; - x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; - - /* - * Quirk: v2 perfmon does not report fixed-purpose counters, so - * assume at least 3 counters: - */ - x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); - - /* - * Install the hw-cache-events table: - */ - switch (boot_cpu_data.x86_model) { - case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ - case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ - case 29: /* six-core 45 nm xeon "Dunnington" */ - memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Core2 events, "); - break; - default: - case 26: - memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Nehalem/Corei7 events, "); - break; - case 28: - memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - pr_cont("Atom events, "); - break; - } - return 0; -} - -static int amd_pmu_init(void) -{ - /* Performance-monitoring supported from K7 and later: */ - if (boot_cpu_data.x86 < 6) - return -ENODEV; - - x86_pmu = amd_pmu; - - /* Events are common for all AMDs */ - memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, - sizeof(hw_cache_event_ids)); - - return 0; -} - -void __init init_hw_perf_counters(void) -{ - int err; - - pr_info("Performance Counters: "); - - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - err = intel_pmu_init(); - break; - case X86_VENDOR_AMD: - err = amd_pmu_init(); - break; - default: - return; - } - if (err != 0) { - pr_cont("no PMU driver, software counters only.\n"); - return; - } - - pr_cont("%s PMU driver.\n", x86_pmu.name); - - if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { - WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - x86_pmu.num_counters, X86_PMC_MAX_GENERIC); - x86_pmu.num_counters = X86_PMC_MAX_GENERIC; - } - perf_counter_mask = (1 << x86_pmu.num_counters) - 1; - perf_max_counters = x86_pmu.num_counters; - - if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { - WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; - } - - perf_counter_mask |= - ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_counter_mask; - - perf_counters_lapic_init(); - register_die_notifier(&perf_counter_nmi_notifier); - - pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.counter_bits); - pr_info("... generic counters: %d\n", x86_pmu.num_counters); - pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose counters: %d\n", x86_pmu.num_counters_fixed); - pr_info("... counter mask: %016Lx\n", perf_counter_mask); -} - -static inline void x86_pmu_read(struct perf_counter *counter) -{ - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); -} - -static const struct pmu pmu = { - .enable = x86_pmu_enable, - .disable = x86_pmu_disable, - .read = x86_pmu_read, - .unthrottle = x86_pmu_unthrottle, -}; - -const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - int err; - - err = __hw_perf_counter_init(counter); - if (err) { - if (counter->destroy) - counter->destroy(counter); - return ERR_PTR(err); - } - - return &pmu; -} - -/* - * callchain support - */ - -static inline -void callchain_store(struct perf_callchain_entry *entry, u64 ip) -{ - if (entry->nr < PERF_MAX_STACK_DEPTH) - entry->ip[entry->nr++] = ip; -} - -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); -static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); - - -static void -backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - /* Ignore warnings */ -} - -static void backtrace_warning(void *data, char *msg) -{ - /* Ignore warnings */ -} - -static int backtrace_stack(void *data, char *name) -{ - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); - - return 0; -} - -static void backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry *entry = data; - - if (per_cpu(in_nmi_frame, smp_processor_id())) - return; - - if (reliable) - callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .warning = backtrace_warning, - .warning_symbol = backtrace_warning_symbol, - .stack = backtrace_stack, - .address = backtrace_address, -}; - -#include "../dumpstack.h" - -static void -perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - callchain_store(entry, PERF_CONTEXT_KERNEL); - callchain_store(entry, regs->ip); - - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); -} - -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page, type); - memcpy(to, map+offset, size); - kunmap_atomic(map, type); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) -{ - unsigned long bytes; - - bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); - - return bytes == sizeof(*frame); -} - -static void -perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - struct stack_frame frame; - const void __user *fp; - - if (!user_mode(regs)) - regs = task_pt_regs(current); - - fp = (void __user *)regs->bp; - - callchain_store(entry, PERF_CONTEXT_USER); - callchain_store(entry, regs->ip); - - while (entry->nr < PERF_MAX_STACK_DEPTH) { - frame.next_frame = NULL; - frame.return_address = 0; - - if (!copy_stack_frame(fp, &frame)) - break; - - if ((unsigned long)fp < regs->sp) - break; - - callchain_store(entry, frame.return_address); - fp = frame.next_frame; - } -} - -static void -perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) -{ - int is_user; - - if (!regs) - return; - - is_user = user_mode(regs); - - if (!current || current->pid == 0) - return; - - if (is_user && current->state != TASK_RUNNING) - return; - - if (!is_user) - perf_callchain_kernel(regs, entry); - - if (current->mm) - perf_callchain_user(regs, entry); -} - -struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - if (in_nmi()) - entry = &__get_cpu_var(pmc_nmi_entry); - else - entry = &__get_cpu_var(pmc_irq_entry); - - entry->nr = 0; - - perf_do_callchain(regs, entry); - - return entry; -} - -void hw_perf_counter_setup_online(int cpu) -{ - init_debug_store_on_cpu(cpu); -} diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c new file mode 100644 index 000000000000..0d03629fb1a5 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event.c @@ -0,0 +1,2298 @@ +/* + * Performance events x86 architecture code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright (C) 2009 Intel Corporation, + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static u64 perf_event_mask __read_mostly; + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +/* The size of a per-cpu BTS buffer in bytes: */ +#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) + +/* The BTS overflow threshold in bytes from the end of the buffer: */ +#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) + + +/* + * Bits in the debugctlmsr controlling branch tracing. + */ +#define X86_DEBUGCTL_TR (1 << 6) +#define X86_DEBUGCTL_BTS (1 << 7) +#define X86_DEBUGCTL_BTINT (1 << 8) +#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) +#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +struct cpu_hw_events { + struct perf_event *events[X86_PMC_IDX_MAX]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long interrupts; + int enabled; + struct debug_store *ds; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { + const char *name; + int version; + int (*handle_irq)(struct pt_regs *); + void (*disable_all)(void); + void (*enable_all)(void); + void (*enable)(struct hw_perf_event *, int); + void (*disable)(struct hw_perf_event *, int); + unsigned eventsel; + unsigned perfctr; + u64 (*event_map)(int); + u64 (*raw_event)(u64); + int max_events; + int num_events; + int num_events_fixed; + int event_bits; + u64 event_mask; + int apic; + u64 max_period; + u64 intel_ctrl; + void (*enable_bts)(u64 config); + void (*disable_bts)(void); +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { + .enabled = 1, +}; + +/* + * Not sure about some of these + */ +static const u64 p6_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0079, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x012e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x0062, +}; + +static u64 p6_pmu_event_map(int hw_event) +{ + return p6_perfmon_event_map[hw_event]; +} + +/* + * Event setting that is specified not to count anything. + * We use this to effectively disable a counter. + * + * L2_RQSTS with 0 MESI unit mask. + */ +#define P6_NOP_EVENT 0x0000002EULL + +static u64 p6_pmu_raw_event(u64 hw_event) +{ +#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL +#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL +#define P6_EVNTSEL_INV_MASK 0x00800000ULL +#define P6_EVNTSEL_REG_MASK 0xFF000000ULL + +#define P6_EVNTSEL_MASK \ + (P6_EVNTSEL_EVENT_MASK | \ + P6_EVNTSEL_UNIT_MASK | \ + P6_EVNTSEL_EDGE_MASK | \ + P6_EVNTSEL_INV_MASK | \ + P6_EVNTSEL_REG_MASK) + + return hw_event & P6_EVNTSEL_MASK; +} + + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_HW_CACHE_MISSES] = 0x412e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, +}; + +static u64 intel_pmu_event_map(int hw_event) +{ + return intel_perfmon_event_map[hw_event]; +} + +/* + * Generalized hw caching related hw_event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'hw_event makes no sense on + * this CPU', any other value means the raw hw_event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ + [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ + [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ + [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0x0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ + [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 core2_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ + [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ + [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static const u64 atom_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */ + [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */ + [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */ + [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */ + [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */ + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */ + [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */ + [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 intel_pmu_raw_event(u64 hw_event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL +#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ + CORE_EVNTSEL_REG_MASK) + + return hw_event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */ + [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */ + }, + }, + [ C(L1I ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */ + [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */ + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */ + [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */ + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ + [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0, + [ C(RESULT_MISS) ] = 0, + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fecthes */ + [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + [ C(BPU ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */ + [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */ + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +/* + * AMD Performance Monitor K7 and later. + */ +static const u64 amd_perfmon_event_map[] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, +}; + +static u64 amd_pmu_event_map(int hw_event) +{ + return amd_perfmon_event_map[hw_event]; +} + +static u64 amd_pmu_raw_event(u64 hw_event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL +#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ + K7_EVNTSEL_REG_MASK) + + return hw_event & K7_EVNTSEL_MASK; +} + +/* + * Propagate event elapsed time into the generic event. + * Can only be executed on the CPU where the event is active. + * Returns the delta events processed. + */ +static u64 +x86_perf_event_update(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + int shift = 64 - x86_pmu.event_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; + + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + + /* + * Careful: an NMI might modify the previous event value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic event atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->event_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (event-)time and add that to the generic event. + * + * Careful, not all hw sign-extends above the physical width + * of the count. + */ + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + + atomic64_add(delta, &event->count); + atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; +} + +static atomic_t active_events; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < x86_pmu.num_events; i++) { + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < x86_pmu.num_events; i++) { + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) + goto eventsel_fail; + } +#endif + + return true; + +#ifdef CONFIG_X86_LOCAL_APIC +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(x86_pmu.eventsel + i); + + i = x86_pmu.num_events; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(x86_pmu.perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +#endif +} + +static void release_pmc_hardware(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + int i; + + for (i = 0; i < x86_pmu.num_events; i++) { + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +#endif +} + +static inline bool bts_available(void) +{ + return x86_pmu.enable_bts != NULL; +} + +static inline void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static inline void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_bts_hardware(void) +{ + int cpu; + + if (!bts_available()) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_bts_hardware(void) +{ + int cpu, err = 0; + + if (!bts_available()) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + + err = -ENOMEM; + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) { + kfree(buffer); + break; + } + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = + ds->bts_buffer_base + BTS_BUFFER_SIZE; + ds->bts_interrupt_threshold = + ds->bts_absolute_maximum - BTS_OVFL_TH; + + per_cpu(cpu_hw_events, cpu).ds = ds; + err = 0; + } + + if (err) + release_bts_hardware(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + +static void hw_perf_event_destroy(struct perf_event *event) +{ + if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { + release_pmc_hardware(); + release_bts_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) +{ + unsigned int cache_type, cache_op, cache_result; + u64 config, val; + + config = attr->config; + + cache_type = (config >> 0) & 0xff; + if (cache_type >= PERF_COUNT_HW_CACHE_MAX) + return -EINVAL; + + cache_op = (config >> 8) & 0xff; + if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) + return -EINVAL; + + cache_result = (config >> 16) & 0xff; + if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) + return -EINVAL; + + val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + + if (val == 0) + return -ENOENT; + + if (val == -1) + return -EINVAL; + + hwc->config |= val; + + return 0; +} + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= X86_DEBUGCTL_TR; + debugctlmsr |= X86_DEBUGCTL_BTS; + debugctlmsr |= X86_DEBUGCTL_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | + X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_event_init(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + u64 config; + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else + err = reserve_bts_hardware(); + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + event->destroy = hw_perf_event_destroy; + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to. + */ + if (!attr->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!attr->exclude_kernel) + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + + if (!hwc->sample_period) { + hwc->sample_period = x86_pmu.max_period; + hwc->last_period = hwc->sample_period; + atomic64_set(&hwc->period_left, hwc->sample_period); + } else { + /* + * If we have a PMU initialized but no APIC + * interrupts, we cannot sample hardware + * events (user-space has to fall back and + * sample via a hrtimer based software event): + */ + if (!x86_pmu.apic) + return -EOPNOTSUPP; + } + + /* + * Raw hw_event type provide the config in the hw_event structure + */ + if (attr->type == PERF_TYPE_RAW) { + hwc->config |= x86_pmu.raw_event(attr->config); + return 0; + } + + if (attr->type == PERF_TYPE_HW_CACHE) + return set_ext_hw_attr(hwc, attr); + + if (attr->config >= x86_pmu.max_events) + return -EINVAL; + + /* + * The generic map: + */ + config = x86_pmu.event_map(attr->config); + + if (config == 0) + return -ENOENT; + + if (config == -1LL) + return -EINVAL; + + /* + * Branch tracing: + */ + if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && + (hwc->sample_period == 1)) { + /* BTS is not supported by this architecture. */ + if (!bts_available()) + return -EOPNOTSUPP; + + /* BTS is currently only allowed for user-mode. */ + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + return -EOPNOTSUPP; + } + + hwc->config |= config; + + return 0; +} + +static void p6_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + barrier(); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) + intel_pmu_disable_bts(); +} + +static void amd_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + if (!cpuc->enabled) + return; + + cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * events proper, so that amd_pmu_enable_event() does the + * right thing. + */ + barrier(); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_disable(void) +{ + if (!x86_pmu_initialized()) + return; + return x86_pmu.disable_all(); +} + +static void p6_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long val; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + /* p6 only has one enable register */ + rdmsrl(MSR_P6_EVNTSEL0, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_P6_EVNTSEL0, val); +} + +static void intel_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + + if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { + struct perf_event *event = + cpuc->events[X86_PMC_IDX_FIXED_BTS]; + + if (WARN_ON_ONCE(!event)) + return; + + intel_pmu_enable_bts(event->hw.config); + } +} + +static void amd_pmu_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + if (cpuc->enabled) + return; + + cpuc->enabled = 1; + barrier(); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + struct perf_event *event = cpuc->events[idx]; + u64 val; + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + val = event->hw.config; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } +} + +void hw_perf_enable(void) +{ + if (!x86_pmu_initialized()) + return; + x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + (void)checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val = P6_NOP_EVENT; + + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + +static inline void +intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + intel_pmu_disable_bts(); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_event(hwc, idx); +} + +static inline void +amd_pmu_disable_event(struct hw_perf_event *hwc, int idx) +{ + x86_pmu_disable_event(hwc, idx); +} + +static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the event disabled in hw: + */ +static int +x86_perf_event_set_period(struct perf_event *event, + struct hw_perf_event *hwc, int idx) +{ + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int err, ret = 0; + + if (idx == X86_PMC_IDX_FIXED_BTS) + return 0; + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + hwc->last_period = period; + ret = 1; + } + /* + * Quirk: certain CPUs dont like it if just 1 hw_event is left: + */ + if (unlikely(left < 2)) + left = 2; + + if (left > x86_pmu.max_period) + left = x86_pmu.max_period; + + per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; + + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)-left); + + err = checking_wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.event_mask); + + perf_event_update_userpage(event); + + return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: + */ + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + u64 val; + + val = hwc->config; + if (cpuc->enabled) + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + (void)checking_wrmsrl(hwc->config_base + idx, val); +} + + +static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { + if (!__get_cpu_var(cpu_hw_events).enabled) + return; + + intel_pmu_enable_bts(hwc->config); + return; + } + + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_event(hwc, idx); +} + +static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->enabled) + x86_pmu_enable_event(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc) +{ + unsigned int hw_event; + + hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely((hw_event == + x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && + (hwc->sample_period == 1))) + return X86_PMC_IDX_FIXED_BTS; + + if (!x86_pmu.num_events_fixed) + return -1; + + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + + return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in event: + */ +static int x86_pmu_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx; + + idx = fixed_mode_idx(event, hwc); + if (idx == X86_PMC_IDX_FIXED_BTS) { + /* BTS is already occupied. */ + if (test_and_set_bit(idx, cpuc->used_mask)) + return -EAGAIN; + + hwc->config_base = 0; + hwc->event_base = 0; + hwc->idx = idx; + } else if (idx >= 0) { + /* + * Try to get the fixed event, if that is already taken + * then try to get a generic event: + */ + if (test_and_set_bit(idx, cpuc->used_mask)) + goto try_generic; + + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that event_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->event_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; + hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic event again */ + if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: + idx = find_first_zero_bit(cpuc->used_mask, + x86_pmu.num_events); + if (idx == x86_pmu.num_events) + return -EAGAIN; + + set_bit(idx, cpuc->used_mask); + hwc->idx = idx; + } + hwc->config_base = x86_pmu.eventsel; + hwc->event_base = x86_pmu.perfctr; + } + + perf_events_lapic_init(); + + x86_pmu.disable(hwc, idx); + + cpuc->events[idx] = event; + set_bit(idx, cpuc->active_mask); + + x86_perf_event_set_period(event, hwc, idx); + x86_pmu.enable(hwc, idx); + + perf_event_update_userpage(event); + + return 0; +} + +static void x86_pmu_unthrottle(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->events[hwc->idx] != event)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + +void perf_event_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + struct cpu_hw_events *cpuc; + unsigned long flags; + int cpu, idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_events, cpu); + + if (x86_pmu.version >= 2) { + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + } + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); + + prev_left = per_cpu(pmc_prev_left[idx], cpu); + + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", + cpu, idx, prev_left); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } + local_irq_restore(flags); +} + +static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) +{ + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + + data.period = event->hw.last_period; + data.addr = 0; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, + header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +static void x86_pmu_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active_mask); + x86_pmu.disable(hwc, idx); + + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the event: + */ + barrier(); + + /* + * Drain the remaining delta count out of a event + * that we are disabling: + */ + x86_perf_event_update(event, hwc, idx); + + /* Drain the remaining BTS records. */ + if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) + intel_pmu_drain_bts_buffer(cpuc); + + cpuc->events[idx] = NULL; + clear_bit(idx, cpuc->used_mask); + + perf_event_update_userpage(event); +} + +/* + * Save and restart an expired event. Called by NMI contexts, + * so it has to be careful about preempting normal event ops: + */ +static int intel_pmu_save_and_restart(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + int ret; + + x86_perf_event_update(event, hwc, idx); + ret = x86_perf_event_set_period(event, hwc, idx); + + if (event->state == PERF_EVENT_STATE_ACTIVE) + intel_pmu_enable_event(hwc, idx); + + return ret; +} + +static void intel_pmu_reset(void) +{ + struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; + unsigned long flags; + int idx; + + if (!x86_pmu.num_events) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + if (ds) + ds->bts_index = ds->bts_buffer_base; + + local_irq_restore(flags); +} + +static int p6_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) + continue; + + /* + * event overflow + */ + handled = 1; + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + p6_pmu_disable_event(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + int bit, loops; + u64 ack, status; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + perf_disable(); + intel_pmu_drain_bts_buffer(cpuc); + status = intel_pmu_get_status(); + if (!status) { + perf_enable(); + return 0; + } + + loops = 0; +again: + if (++loops > 100) { + WARN_ONCE(1, "perfevents: irq loop stuck!\n"); + perf_event_print_debug(); + intel_pmu_reset(); + perf_enable(); + return 1; + } + + inc_irq_stat(apic_perf_irqs); + ack = status; + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { + struct perf_event *event = cpuc->events[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!test_bit(bit, cpuc->active_mask)) + continue; + + if (!intel_pmu_save_and_restart(event)) + continue; + + data.period = event->hw.last_period; + + if (perf_event_overflow(event, 1, &data, regs)) + intel_pmu_disable_event(&event->hw, bit); + } + + intel_pmu_ack_status(ack); + + /* + * Repeat if there is more work to be done: + */ + status = intel_pmu_get_status(); + if (status) + goto again; + + perf_enable(); + + return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_events; idx++) { + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + val = x86_perf_event_update(event, hwc, idx); + if (val & (1ULL << (x86_pmu.event_bits - 1))) + continue; + + /* + * event overflow + */ + handled = 1; + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event, hwc, idx)) + continue; + + if (perf_event_overflow(event, 1, &data, regs)) + amd_pmu_disable_event(hwc, idx); + } + + if (handled) + inc_irq_stat(apic_perf_irqs); + + return handled; +} + +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_event_do_pending(); + irq_exit(); +} + +void set_perf_event_pending(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +#endif +} + +void perf_events_lapic_init(void) +{ +#ifdef CONFIG_X86_LOCAL_APIC + if (!x86_pmu.apic || !x86_pmu_initialized()) + return; + + /* + * Always use NMI for PMU + */ + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif +} + +static int __kprobes +perf_event_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (!atomic_read(&active_events)) + return NOTIFY_DONE; + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + +#ifdef CONFIG_X86_LOCAL_APIC + apic_write(APIC_LVTPC, APIC_DM_NMI); +#endif + /* + * Can't rely on the handled return value to say it was our NMI, two + * events could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_event_nmi_notifier = { + .notifier_call = perf_event_nmi_handler, + .next = NULL, + .priority = 1 +}; + +static struct x86_pmu p6_pmu = { + .name = "p6", + .handle_irq = p6_pmu_handle_irq, + .disable_all = p6_pmu_disable_all, + .enable_all = p6_pmu_enable_all, + .enable = p6_pmu_enable_event, + .disable = p6_pmu_disable_event, + .eventsel = MSR_P6_EVNTSEL0, + .perfctr = MSR_P6_PERFCTR0, + .event_map = p6_pmu_event_map, + .raw_event = p6_pmu_raw_event, + .max_events = ARRAY_SIZE(p6_perfmon_event_map), + .apic = 1, + .max_period = (1ULL << 31) - 1, + .version = 0, + .num_events = 2, + /* + * Events have 40 bits implemented. However they are designed such + * that bits [32-39] are sign extensions of bit 31. As such the + * effective width of a event for P6-like PMU is 32 bits only. + * + * See IA-32 Intel Architecture Software developer manual Vol 3B + */ + .event_bits = 32, + .event_mask = (1ULL << 32) - 1, +}; + +static struct x86_pmu intel_pmu = { + .name = "Intel", + .handle_irq = intel_pmu_handle_irq, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, + .enable = intel_pmu_enable_event, + .disable = intel_pmu_disable_event, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), + .apic = 1, + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic event period: + */ + .max_period = (1ULL << 31) - 1, + .enable_bts = intel_pmu_enable_bts, + .disable_bts = intel_pmu_disable_bts, +}; + +static struct x86_pmu amd_pmu = { + .name = "AMD", + .handle_irq = amd_pmu_handle_irq, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, + .enable = amd_pmu_enable_event, + .disable = amd_pmu_disable_event, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_events = 4, + .event_bits = 48, + .event_mask = (1ULL << 48) - 1, + .apic = 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, +}; + +static int p6_pmu_init(void) +{ + switch (boot_cpu_data.x86_model) { + case 1: + case 3: /* Pentium Pro */ + case 5: + case 6: /* Pentium II */ + case 7: + case 8: + case 11: /* Pentium III */ + break; + case 9: + case 13: + /* Pentium M */ + break; + default: + pr_cont("unsupported p6 CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + x86_pmu = p6_pmu; + + if (!cpu_has_apic) { + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); + x86_pmu.apic = 0; + } + + return 0; +} + +static int intel_pmu_init(void) +{ + union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + int version; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { + /* check for P6 processor family */ + if (boot_cpu_data.x86 == 6) { + return p6_pmu_init(); + } else { + return -ENODEV; + } + } + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired hw_event or not. + */ + cpuid(10, &eax.full, &ebx, &unused, &edx.full); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return -ENODEV; + + version = eax.split.version_id; + if (version < 2) + return -ENODEV; + + x86_pmu = intel_pmu; + x86_pmu.version = version; + x86_pmu.num_events = eax.split.num_events; + x86_pmu.event_bits = eax.split.bit_width; + x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + + /* + * Quirk: v2 perfmon does not report fixed-purpose events, so + * assume at least 3 events: + */ + x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + + /* + * Install the hw-cache-events table: + */ + switch (boot_cpu_data.x86_model) { + case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ + case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ + case 29: /* six-core 45 nm xeon "Dunnington" */ + memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Core2 events, "); + break; + default: + case 26: + memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Nehalem/Corei7 events, "); + break; + case 28: + memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Atom events, "); + break; + } + return 0; +} + +static int amd_pmu_init(void) +{ + /* Performance-monitoring supported from K7 and later: */ + if (boot_cpu_data.x86 < 6) + return -ENODEV; + + x86_pmu = amd_pmu; + + /* Events are common for all AMDs */ + memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + return 0; +} + +void __init init_hw_perf_events(void) +{ + int err; + + pr_info("Performance Events: "); + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + err = intel_pmu_init(); + break; + case X86_VENDOR_AMD: + err = amd_pmu_init(); + break; + default: + return; + } + if (err != 0) { + pr_cont("no PMU driver, software events only.\n"); + return; + } + + pr_cont("%s PMU driver.\n", x86_pmu.name); + + if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", + x86_pmu.num_events, X86_PMC_MAX_GENERIC); + x86_pmu.num_events = X86_PMC_MAX_GENERIC; + } + perf_event_mask = (1 << x86_pmu.num_events) - 1; + perf_max_events = x86_pmu.num_events; + + if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", + x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; + } + + perf_event_mask |= + ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; + x86_pmu.intel_ctrl = perf_event_mask; + + perf_events_lapic_init(); + register_die_notifier(&perf_event_nmi_notifier); + + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.event_bits); + pr_info("... generic events: %d\n", x86_pmu.num_events); + pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); + pr_info("... event mask: %016Lx\n", perf_event_mask); +} + +static inline void x86_pmu_read(struct perf_event *event) +{ + x86_perf_event_update(event, &event->hw, event->hw.idx); +} + +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + int err; + + err = __hw_perf_event_init(event); + if (err) { + if (event->destroy) + event->destroy(event); + return ERR_PTR(err); + } + + return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, u64 ip) +{ + if (entry->nr < PERF_MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); +static DEFINE_PER_CPU(int, in_nmi_frame); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + per_cpu(in_nmi_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name); + + return 0; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (per_cpu(in_nmi_frame, smp_processor_id())) + return; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +#include "../dumpstack.h" + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + callchain_store(entry, PERF_CONTEXT_KERNEL); + callchain_store(entry, regs->ip); + + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); +} + +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; + + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; + + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); + + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + + len += size; + to += size; + addr += size; + + } while (len < n); + + return len; +} + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + unsigned long bytes; + + bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); + + return bytes == sizeof(*frame); +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + if (!user_mode(regs)) + regs = task_pt_regs(current); + + fp = (void __user *)regs->bp; + + callchain_store(entry, PERF_CONTEXT_USER); + callchain_store(entry, regs->ip); + + while (entry->nr < PERF_MAX_STACK_DEPTH) { + frame.next_frame = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < regs->sp) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_frame; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(pmc_nmi_entry); + else + entry = &__get_cpu_var(pmc_irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} + +void hw_perf_event_setup_online(int cpu) +{ + init_debug_store_on_cpu(cpu); +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 392bea43b890..fab786f60ed6 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d59fe323807e..681c3fda7391 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1021,7 +1021,7 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS apicinterrupt LOCAL_PENDING_VECTOR \ perf_pending_interrupt smp_perf_pending_interrupt #endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 300883112e3d..40f30773fb29 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -208,7 +208,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); /* Performance monitoring interrupts: */ -# ifdef CONFIG_PERF_COUNTERS +# ifdef CONFIG_PERF_EVENTS alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d51321ddafda..0157cd26d7cc 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,4 +335,4 @@ ENTRY(sys_call_table) .long sys_preadv .long sys_pwritev .long sys_rt_tgsigqueueinfo /* 335 */ - .long sys_perf_counter_open + .long sys_perf_event_open diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 775a020990a5..82728f2c6d55 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,7 +10,7 @@ #include /* max_low_pfn */ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ -#include /* perf_swcounter_event */ +#include /* perf_sw_event */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -1017,7 +1017,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1114,11 +1114,11 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, + perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, regs, address); } diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 4899215999de..8eb05878554c 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -234,11 +234,11 @@ static void arch_perfmon_setup_counters(void) if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && current_cpu_data.x86_model == 15) { eax.split.version_id = 2; - eax.split.num_counters = 2; + eax.split.num_events = 2; eax.split.bit_width = 40; } - num_counters = eax.split.num_counters; + num_counters = eax.split.num_events; op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index b83776180c7f..7b8e75d16081 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -13,7 +13,7 @@ #define OP_X86_MODEL_H #include -#include +#include struct op_msr { unsigned long addr; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index 50eecfe1d724..44203ff599da 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include #include @@ -252,7 +252,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) struct pt_regs *regs = get_irq_regs(); if (regs) show_regs(regs); - perf_counter_print_debug(); + perf_event_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, diff --git a/fs/exec.c b/fs/exec.c index 172ceb6edde4..434dba778ccc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,7 +33,7 @@ #include #include #include -#include +#include #include #include #include @@ -923,7 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); - perf_counter_comm(tsk); + perf_event_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) @@ -997,7 +997,7 @@ int flush_old_exec(struct linux_binprm * bprm) * security domain: */ if (!get_dumpable(current->mm)) - perf_counter_exit_task(current); + perf_event_exit_task(current); /* An exec changes our domain. We are no longer part of the thread group */ diff --git a/include/asm-generic/unistd.h b/include/asm-generic/unistd.h index 1125e5a1ee5d..d76b66acea95 100644 --- a/include/asm-generic/unistd.h +++ b/include/asm-generic/unistd.h @@ -620,8 +620,8 @@ __SYSCALL(__NR_move_pages, sys_move_pages) #define __NR_rt_tgsigqueueinfo 240 __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) -#define __NR_perf_counter_open 241 -__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) +#define __NR_perf_event_open 241 +__SYSCALL(__NR_perf_event_open, sys_perf_event_open) #undef __NR_syscalls #define __NR_syscalls 242 diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9e7f2e8fc66e..21a6f5d9af22 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -106,13 +106,13 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_mutex = \ - __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ - .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#ifdef CONFIG_PERF_EVENTS +# define INIT_PERF_EVENTS(tsk) \ + .perf_event_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_event_mutex), \ + .perf_event_list = LIST_HEAD_INIT(tsk.perf_event_list), #else -# define INIT_PERF_COUNTERS(tsk) +# define INIT_PERF_EVENTS(tsk) #endif /* @@ -178,7 +178,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ + INIT_PERF_EVENTS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h deleted file mode 100644 index f64862732673..000000000000 --- a/include/linux/perf_counter.h +++ /dev/null @@ -1,858 +0,0 @@ -/* - * Performance counters: - * - * Copyright (C) 2008-2009, Thomas Gleixner - * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra - * - * Data type definitions, declarations, prototypes. - * - * Started by: Thomas Gleixner and Ingo Molnar - * - * For licencing details see kernel-base/COPYING - */ -#ifndef _LINUX_PERF_COUNTER_H -#define _LINUX_PERF_COUNTER_H - -#include -#include -#include - -/* - * User-space ABI bits: - */ - -/* - * attr.type - */ -enum perf_type_id { - PERF_TYPE_HARDWARE = 0, - PERF_TYPE_SOFTWARE = 1, - PERF_TYPE_TRACEPOINT = 2, - PERF_TYPE_HW_CACHE = 3, - PERF_TYPE_RAW = 4, - - PERF_TYPE_MAX, /* non-ABI */ -}; - -/* - * Generalized performance counter event types, used by the - * attr.event_id parameter of the sys_perf_counter_open() - * syscall: - */ -enum perf_hw_id { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_HW_CPU_CYCLES = 0, - PERF_COUNT_HW_INSTRUCTIONS = 1, - PERF_COUNT_HW_CACHE_REFERENCES = 2, - PERF_COUNT_HW_CACHE_MISSES = 3, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_HW_BRANCH_MISSES = 5, - PERF_COUNT_HW_BUS_CYCLES = 6, - - PERF_COUNT_HW_MAX, /* non-ABI */ -}; - -/* - * Generalized hardware cache counters: - * - * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x - * { read, write, prefetch } x - * { accesses, misses } - */ -enum perf_hw_cache_id { - PERF_COUNT_HW_CACHE_L1D = 0, - PERF_COUNT_HW_CACHE_L1I = 1, - PERF_COUNT_HW_CACHE_LL = 2, - PERF_COUNT_HW_CACHE_DTLB = 3, - PERF_COUNT_HW_CACHE_ITLB = 4, - PERF_COUNT_HW_CACHE_BPU = 5, - - PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_id { - PERF_COUNT_HW_CACHE_OP_READ = 0, - PERF_COUNT_HW_CACHE_OP_WRITE = 1, - PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, - - PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ -}; - -enum perf_hw_cache_op_result_id { - PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, - PERF_COUNT_HW_CACHE_RESULT_MISS = 1, - - PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ -}; - -/* - * Special "software" counters provided by the kernel, even if the hardware - * does not support performance counters. These counters measure various - * physical and sw events of the kernel (and allow the profiling of them as - * well): - */ -enum perf_sw_ids { - PERF_COUNT_SW_CPU_CLOCK = 0, - PERF_COUNT_SW_TASK_CLOCK = 1, - PERF_COUNT_SW_PAGE_FAULTS = 2, - PERF_COUNT_SW_CONTEXT_SWITCHES = 3, - PERF_COUNT_SW_CPU_MIGRATIONS = 4, - PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, - PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, - - PERF_COUNT_SW_MAX, /* non-ABI */ -}; - -/* - * Bits that can be set in attr.sample_type to request information - * in the overflow packets. - */ -enum perf_counter_sample_format { - PERF_SAMPLE_IP = 1U << 0, - PERF_SAMPLE_TID = 1U << 1, - PERF_SAMPLE_TIME = 1U << 2, - PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_READ = 1U << 4, - PERF_SAMPLE_CALLCHAIN = 1U << 5, - PERF_SAMPLE_ID = 1U << 6, - PERF_SAMPLE_CPU = 1U << 7, - PERF_SAMPLE_PERIOD = 1U << 8, - PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_RAW = 1U << 10, - - PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ -}; - -/* - * The format of the data returned by read() on a perf counter fd, - * as specified by attr.read_format: - * - * struct read_format { - * { u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 id; } && PERF_FORMAT_ID - * } && !PERF_FORMAT_GROUP - * - * { u64 nr; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 value; - * { u64 id; } && PERF_FORMAT_ID - * } cntr[nr]; - * } && PERF_FORMAT_GROUP - * }; - */ -enum perf_counter_read_format { - PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, - PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, - PERF_FORMAT_ID = 1U << 2, - PERF_FORMAT_GROUP = 1U << 3, - - PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ -}; - -#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_attr { - - /* - * Major type: hardware/software/tracepoint/etc. - */ - __u32 type; - - /* - * Size of the attr structure, for fwd/bwd compat. - */ - __u32 size; - - /* - * Type specific configuration information. - */ - __u64 config; - - union { - __u64 sample_period; - __u64 sample_freq; - }; - - __u64 sample_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - mmap : 1, /* include mmap data */ - comm : 1, /* include comm data */ - freq : 1, /* use freq, not period */ - inherit_stat : 1, /* per task counts */ - enable_on_exec : 1, /* next exec enables */ - task : 1, /* trace fork/exit */ - watermark : 1, /* wakeup_watermark */ - - __reserved_1 : 49; - - union { - __u32 wakeup_events; /* wakeup every n events */ - __u32 wakeup_watermark; /* bytes before wakeup */ - }; - __u32 __reserved_2; - - __u64 __reserved_3; -}; - -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) -#define PERF_COUNTER_IOC_REFRESH _IO ('$', 2) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) -#define PERF_COUNTER_IOC_PERIOD _IOW('$', 4, u64) -#define PERF_COUNTER_IOC_SET_OUTPUT _IO ('$', 5) - -enum perf_counter_ioc_flags { - PERF_IOC_FLAG_GROUP = 1U << 0, -}; - -/* - * Structure of the page that can be mapped via mmap - */ -struct perf_counter_mmap_page { - __u32 version; /* version number of this structure */ - __u32 compat_version; /* lowest version this is compat with */ - - /* - * Bits needed to read the hw counters in user-space. - * - * u32 seq; - * s64 count; - * - * do { - * seq = pc->lock; - * - * barrier() - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; - * - * barrier(); - * } while (pc->lock != seq); - * - * NOTE: for obvious reason this only works on self-monitoring - * processes. - */ - __u32 lock; /* seqlock for synchronization */ - __u32 index; /* hardware counter identifier */ - __s64 offset; /* add to hardware counter value */ - __u64 time_enabled; /* time counter active */ - __u64 time_running; /* time counter on cpu */ - - /* - * Hole for extension of the self monitor capabilities - */ - - __u64 __reserved[123]; /* align to 1k */ - - /* - * Control data for the mmap() data buffer. - * - * User-space reading the @data_head value should issue an rmb(), on - * SMP capable platforms, after reading this value -- see - * perf_counter_wakeup(). - * - * When the mapping is PROT_WRITE the @data_tail value should be - * written by userspace to reflect the last read data. In this case - * the kernel will not over-write unread data. - */ - __u64 data_head; /* head in the data section */ - __u64 data_tail; /* user-space written tail */ -}; - -#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) -#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (2 << 0) -#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) - -struct perf_event_header { - __u32 type; - __u16 misc; - __u16 size; -}; - -enum perf_event_type { - - /* - * The MMAP events record the PROT_EXEC mappings so that we can - * correlate userspace IPs to code. They have the following structure: - * - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; - * }; - */ - PERF_EVENT_MMAP = 1, - - /* - * struct { - * struct perf_event_header header; - * u64 id; - * u64 lost; - * }; - */ - PERF_EVENT_LOST = 2, - - /* - * struct { - * struct perf_event_header header; - * - * u32 pid, tid; - * char comm[]; - * }; - */ - PERF_EVENT_COMM = 3, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * u64 time; - * }; - */ - PERF_EVENT_EXIT = 4, - - /* - * struct { - * struct perf_event_header header; - * u64 time; - * u64 id; - * u64 stream_id; - * }; - */ - PERF_EVENT_THROTTLE = 5, - PERF_EVENT_UNTHROTTLE = 6, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, ppid; - * u32 tid, ptid; - * { u64 time; } && PERF_SAMPLE_TIME - * }; - */ - PERF_EVENT_FORK = 7, - - /* - * struct { - * struct perf_event_header header; - * u32 pid, tid; - * - * struct read_format values; - * }; - */ - PERF_EVENT_READ = 8, - - /* - * struct { - * struct perf_event_header header; - * - * { u64 ip; } && PERF_SAMPLE_IP - * { u32 pid, tid; } && PERF_SAMPLE_TID - * { u64 time; } && PERF_SAMPLE_TIME - * { u64 addr; } && PERF_SAMPLE_ADDR - * { u64 id; } && PERF_SAMPLE_ID - * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID - * { u32 cpu, res; } && PERF_SAMPLE_CPU - * { u64 period; } && PERF_SAMPLE_PERIOD - * - * { struct read_format values; } && PERF_SAMPLE_READ - * - * { u64 nr, - * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN - * - * # - * # The RAW record below is opaque data wrt the ABI - * # - * # That is, the ABI doesn't make any promises wrt to - * # the stability of its content, it may vary depending - * # on event, hardware, kernel version and phase of - * # the moon. - * # - * # In other words, PERF_SAMPLE_RAW contents are not an ABI. - * # - * - * { u32 size; - * char data[size];}&& PERF_SAMPLE_RAW - * }; - */ - PERF_EVENT_SAMPLE = 9, - - PERF_EVENT_MAX, /* non-ABI */ -}; - -enum perf_callchain_context { - PERF_CONTEXT_HV = (__u64)-32, - PERF_CONTEXT_KERNEL = (__u64)-128, - PERF_CONTEXT_USER = (__u64)-512, - - PERF_CONTEXT_GUEST = (__u64)-2048, - PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, - PERF_CONTEXT_GUEST_USER = (__u64)-2560, - - PERF_CONTEXT_MAX = (__u64)-4095, -}; - -#define PERF_FLAG_FD_NO_GROUP (1U << 0) -#define PERF_FLAG_FD_OUTPUT (1U << 1) - -#ifdef __KERNEL__ -/* - * Kernel-internal data types and definitions: - */ - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define PERF_MAX_STACK_DEPTH 255 - -struct perf_callchain_entry { - __u64 nr; - __u64 ip[PERF_MAX_STACK_DEPTH]; -}; - -struct perf_raw_record { - u32 size; - void *data; -}; - -struct task_struct; - -/** - * struct hw_perf_counter - performance counter hardware details: - */ -struct hw_perf_counter { -#ifdef CONFIG_PERF_COUNTERS - union { - struct { /* hardware */ - u64 config; - unsigned long config_base; - unsigned long counter_base; - int idx; - }; - union { /* software */ - atomic64_t count; - struct hrtimer hrtimer; - }; - }; - atomic64_t prev_count; - u64 sample_period; - u64 last_period; - atomic64_t period_left; - u64 interrupts; - - u64 freq_count; - u64 freq_interrupts; - u64 freq_stamp; -#endif -}; - -struct perf_counter; - -/** - * struct pmu - generic performance monitoring unit - */ -struct pmu { - int (*enable) (struct perf_counter *counter); - void (*disable) (struct perf_counter *counter); - void (*read) (struct perf_counter *counter); - void (*unthrottle) (struct perf_counter *counter); -}; - -/** - * enum perf_counter_active_state - the states of a counter - */ -enum perf_counter_active_state { - PERF_COUNTER_STATE_ERROR = -2, - PERF_COUNTER_STATE_OFF = -1, - PERF_COUNTER_STATE_INACTIVE = 0, - PERF_COUNTER_STATE_ACTIVE = 1, -}; - -struct file; - -struct perf_mmap_data { - struct rcu_head rcu_head; - int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ - int nr_locked; /* nr pages mlocked */ - - atomic_t poll; /* POLL_ for wakeups */ - atomic_t events; /* event limit */ - - atomic_long_t head; /* write position */ - atomic_long_t done_head; /* completed head */ - - atomic_t lock; /* concurrent writes */ - atomic_t wakeup; /* needs a wakeup */ - atomic_t lost; /* nr records lost */ - - long watermark; /* wakeup watermark */ - - struct perf_counter_mmap_page *user_page; - void *data_pages[0]; -}; - -struct perf_pending_entry { - struct perf_pending_entry *next; - void (*func)(struct perf_pending_entry *); -}; - -/** - * struct perf_counter - performance counter kernel representation: - */ -struct perf_counter { -#ifdef CONFIG_PERF_COUNTERS - struct list_head group_entry; - struct list_head event_entry; - struct list_head sibling_list; - int nr_siblings; - struct perf_counter *group_leader; - struct perf_counter *output; - const struct pmu *pmu; - - enum perf_counter_active_state state; - atomic64_t count; - - /* - * These are the total time in nanoseconds that the counter - * has been enabled (i.e. eligible to run, and the task has - * been scheduled in, if this is a per-task counter) - * and running (scheduled onto the CPU), respectively. - * - * They are computed from tstamp_enabled, tstamp_running and - * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. - */ - u64 total_time_enabled; - u64 total_time_running; - - /* - * These are timestamps used for computing total_time_enabled - * and total_time_running when the counter is in INACTIVE or - * ACTIVE state, measured in nanoseconds from an arbitrary point - * in time. - * tstamp_enabled: the notional time when the counter was enabled - * tstamp_running: the notional time when the counter was scheduled on - * tstamp_stopped: in INACTIVE state, the notional time when the - * counter was scheduled off. - */ - u64 tstamp_enabled; - u64 tstamp_running; - u64 tstamp_stopped; - - struct perf_counter_attr attr; - struct hw_perf_counter hw; - - struct perf_counter_context *ctx; - struct file *filp; - - /* - * These accumulate total time (in nanoseconds) that children - * counters have been enabled and running, respectively. - */ - atomic64_t child_total_time_enabled; - atomic64_t child_total_time_running; - - /* - * Protect attach/detach and child_list: - */ - struct mutex child_mutex; - struct list_head child_list; - struct perf_counter *parent; - - int oncpu; - int cpu; - - struct list_head owner_entry; - struct task_struct *owner; - - /* mmap bits */ - struct mutex mmap_mutex; - atomic_t mmap_count; - struct perf_mmap_data *data; - - /* poll related */ - wait_queue_head_t waitq; - struct fasync_struct *fasync; - - /* delayed work for NMIs and such */ - int pending_wakeup; - int pending_kill; - int pending_disable; - struct perf_pending_entry pending; - - atomic_t event_limit; - - void (*destroy)(struct perf_counter *); - struct rcu_head rcu_head; - - struct pid_namespace *ns; - u64 id; -#endif -}; - -/** - * struct perf_counter_context - counter context structure - * - * Used as a container for task counters and CPU counters as well: - */ -struct perf_counter_context { - /* - * Protect the states of the counters in the list, - * nr_active, and the list: - */ - spinlock_t lock; - /* - * Protect the list of counters. Locking either mutex or lock - * is sufficient to ensure the list doesn't change; to change - * the list you need to lock both the mutex and the spinlock. - */ - struct mutex mutex; - - struct list_head group_list; - struct list_head event_list; - int nr_counters; - int nr_active; - int is_active; - int nr_stat; - atomic_t refcount; - struct task_struct *task; - - /* - * Context clock, runs when context enabled. - */ - u64 time; - u64 timestamp; - - /* - * These fields let us detect when two contexts have both - * been cloned (inherited) from a common ancestor. - */ - struct perf_counter_context *parent_ctx; - u64 parent_gen; - u64 generation; - int pin_count; - struct rcu_head rcu_head; -}; - -/** - * struct perf_counter_cpu_context - per cpu counter context structure - */ -struct perf_cpu_context { - struct perf_counter_context ctx; - struct perf_counter_context *task_ctx; - int active_oncpu; - int max_pertask; - int exclusive; - - /* - * Recursion avoidance: - * - * task, softirq, irq, nmi context - */ - int recursion[4]; -}; - -struct perf_output_handle { - struct perf_counter *counter; - struct perf_mmap_data *data; - unsigned long head; - unsigned long offset; - int nmi; - int sample; - int locked; - unsigned long flags; -}; - -#ifdef CONFIG_PERF_COUNTERS - -/* - * Set by architecture code: - */ -extern int perf_max_counters; - -extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); - -extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu); -extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern int perf_counter_init_task(struct task_struct *child); -extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_free_task(struct task_struct *task); -extern void set_perf_counter_pending(void); -extern void perf_counter_do_pending(void); -extern void perf_counter_print_debug(void); -extern void __perf_disable(void); -extern bool __perf_enable(void); -extern void perf_disable(void); -extern void perf_enable(void); -extern int perf_counter_task_disable(void); -extern int perf_counter_task_enable(void); -extern int hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu); -extern void perf_counter_update_userpage(struct perf_counter *counter); - -struct perf_sample_data { - u64 type; - - u64 ip; - struct { - u32 pid; - u32 tid; - } tid_entry; - u64 time; - u64 addr; - u64 id; - u64 stream_id; - struct { - u32 cpu; - u32 reserved; - } cpu_entry; - u64 period; - struct perf_callchain_entry *callchain; - struct perf_raw_record *raw; -}; - -extern void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter); -extern void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter, - struct pt_regs *regs); - -extern int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs); - -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return (counter->attr.type != PERF_TYPE_RAW) && - (counter->attr.type != PERF_TYPE_HARDWARE) && - (counter->attr.type != PERF_TYPE_HW_CACHE); -} - -extern atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -extern void __perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); - -static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) -{ - if (atomic_read(&perf_swcounter_enabled[event])) - __perf_swcounter_event(event, nr, nmi, regs, addr); -} - -extern void __perf_counter_mmap(struct vm_area_struct *vma); - -static inline void perf_counter_mmap(struct vm_area_struct *vma) -{ - if (vma->vm_flags & VM_EXEC) - __perf_counter_mmap(vma); -} - -extern void perf_counter_comm(struct task_struct *tsk); -extern void perf_counter_fork(struct task_struct *tsk); - -extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); - -extern int sysctl_perf_counter_paranoid; -extern int sysctl_perf_counter_mlock; -extern int sysctl_perf_counter_sample_rate; - -extern void perf_counter_init(void); -extern void perf_tpcounter_event(int event_id, u64 addr, u64 count, - void *record, int entry_size); - -#ifndef perf_misc_flags -#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ - PERF_EVENT_MISC_KERNEL) -#define perf_instruction_pointer(regs) instruction_pointer(regs) -#endif - -extern int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample); -extern void perf_output_end(struct perf_output_handle *handle); -extern void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len); -#else -static inline void -perf_counter_task_sched_in(struct task_struct *task, int cpu) { } -static inline void -perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) { } -static inline void -perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline int perf_counter_init_task(struct task_struct *child) { return 0; } -static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_free_task(struct task_struct *task) { } -static inline void perf_counter_do_pending(void) { } -static inline void perf_counter_print_debug(void) { } -static inline void perf_disable(void) { } -static inline void perf_enable(void) { } -static inline int perf_counter_task_disable(void) { return -EINVAL; } -static inline int perf_counter_task_enable(void) { return -EINVAL; } - -static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) { } - -static inline void perf_counter_mmap(struct vm_area_struct *vma) { } -static inline void perf_counter_comm(struct task_struct *tsk) { } -static inline void perf_counter_fork(struct task_struct *tsk) { } -static inline void perf_counter_init(void) { } - -#endif - -#define perf_output_put(handle, x) \ - perf_output_copy((handle), &(x), sizeof(x)) - -#endif /* __KERNEL__ */ -#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h new file mode 100644 index 000000000000..ae9d9ed6df2a --- /dev/null +++ b/include/linux/perf_event.h @@ -0,0 +1,858 @@ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_EVENT_H +#define _LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + + __reserved_1 : 49; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + __u32 __reserved_2; + + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + + /* + * Hole for extension of the self monitor capabilities + */ + + __u64 __reserved[123]; /* align to 1k */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an rmb(), on + * SMP capable platforms, after reading this value -- see + * perf_event_wakeup(). + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data. In this case + * the kernel will not over-write unread data. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ +}; + +#define PERF_RECORD_MISC_CPUMODE_MASK (3 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * { u64 time; } && PERF_SAMPLE_TIME + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event_id, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * }; + */ + PERF_RECORD_SAMPLE = 9, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +#define PERF_FLAG_FD_NO_GROUP (1U << 0) +#define PERF_FLAG_FD_OUTPUT (1U << 1) + +#ifdef __KERNEL__ +/* + * Kernel-internal data types and definitions: + */ + +#ifdef CONFIG_PERF_EVENTS +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define PERF_MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + __u64 nr; + __u64 ip[PERF_MAX_STACK_DEPTH]; +}; + +struct perf_raw_record { + u32 size; + void *data; +}; + +struct task_struct; + +/** + * struct hw_perf_event - performance event hardware details: + */ +struct hw_perf_event { +#ifdef CONFIG_PERF_EVENTS + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long event_base; + int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; + atomic64_t prev_count; + u64 sample_period; + u64 last_period; + atomic64_t period_left; + u64 interrupts; + + u64 freq_count; + u64 freq_interrupts; + u64 freq_stamp; +#endif +}; + +struct perf_event; + +/** + * struct pmu - generic performance monitoring unit + */ +struct pmu { + int (*enable) (struct perf_event *event); + void (*disable) (struct perf_event *event); + void (*read) (struct perf_event *event); + void (*unthrottle) (struct perf_event *event); +}; + +/** + * enum perf_event_active_state - the states of a event + */ +enum perf_event_active_state { + PERF_EVENT_STATE_ERROR = -2, + PERF_EVENT_STATE_OFF = -1, + PERF_EVENT_STATE_INACTIVE = 0, + PERF_EVENT_STATE_ACTIVE = 1, +}; + +struct file; + +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; /* nr of data pages */ + int writable; /* are we writable */ + int nr_locked; /* nr pages mlocked */ + + atomic_t poll; /* POLL_ for wakeups */ + atomic_t events; /* event_id limit */ + + atomic_long_t head; /* write position */ + atomic_long_t done_head; /* completed head */ + + atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + atomic_t lost; /* nr records lost */ + + long watermark; /* wakeup watermark */ + + struct perf_event_mmap_page *user_page; + void *data_pages[0]; +}; + +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); +}; + +/** + * struct perf_event - performance event kernel representation: + */ +struct perf_event { +#ifdef CONFIG_PERF_EVENTS + struct list_head group_entry; + struct list_head event_entry; + struct list_head sibling_list; + int nr_siblings; + struct perf_event *group_leader; + struct perf_event *output; + const struct pmu *pmu; + + enum perf_event_active_state state; + atomic64_t count; + + /* + * These are the total time in nanoseconds that the event + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task event) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the event is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the event is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the event was enabled + * tstamp_running: the notional time when the event was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * event was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + + struct perf_event_attr attr; + struct hw_perf_event hw; + + struct perf_event_context *ctx; + struct file *filp; + + /* + * These accumulate total time (in nanoseconds) that children + * events have been enabled and running, respectively. + */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + + /* + * Protect attach/detach and child_list: + */ + struct mutex child_mutex; + struct list_head child_list; + struct perf_event *parent; + + int oncpu; + int cpu; + + struct list_head owner_entry; + struct task_struct *owner; + + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; + + /* poll related */ + wait_queue_head_t waitq; + struct fasync_struct *fasync; + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_kill; + int pending_disable; + struct perf_pending_entry pending; + + atomic_t event_limit; + + void (*destroy)(struct perf_event *); + struct rcu_head rcu_head; + + struct pid_namespace *ns; + u64 id; +#endif +}; + +/** + * struct perf_event_context - event context structure + * + * Used as a container for task events and CPU events as well: + */ +struct perf_event_context { + /* + * Protect the states of the events in the list, + * nr_active, and the list: + */ + spinlock_t lock; + /* + * Protect the list of events. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; + + struct list_head group_list; + struct list_head event_list; + int nr_events; + int nr_active; + int is_active; + int nr_stat; + atomic_t refcount; + struct task_struct *task; + + /* + * Context clock, runs when context enabled. + */ + u64 time; + u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. + */ + struct perf_event_context *parent_ctx; + u64 parent_gen; + u64 generation; + int pin_count; + struct rcu_head rcu_head; +}; + +/** + * struct perf_event_cpu_context - per cpu event context structure + */ +struct perf_cpu_context { + struct perf_event_context ctx; + struct perf_event_context *task_ctx; + int active_oncpu; + int max_pertask; + int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; +}; + +struct perf_output_handle { + struct perf_event *event; + struct perf_mmap_data *data; + unsigned long head; + unsigned long offset; + int nmi; + int sample; + int locked; + unsigned long flags; +}; + +#ifdef CONFIG_PERF_EVENTS + +/* + * Set by architecture code: + */ +extern int perf_max_events; + +extern const struct pmu *hw_perf_event_init(struct perf_event *event); + +extern void perf_event_task_sched_in(struct task_struct *task, int cpu); +extern void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); +extern void perf_event_task_tick(struct task_struct *task, int cpu); +extern int perf_event_init_task(struct task_struct *child); +extern void perf_event_exit_task(struct task_struct *child); +extern void perf_event_free_task(struct task_struct *task); +extern void set_perf_event_pending(void); +extern void perf_event_do_pending(void); +extern void perf_event_print_debug(void); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); +extern int perf_event_task_disable(void); +extern int perf_event_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu); +extern void perf_event_update_userpage(struct perf_event *event); + +struct perf_sample_data { + u64 type; + + u64 ip; + struct { + u32 pid; + u32 tid; + } tid_entry; + u64 time; + u64 addr; + u64 id; + u64 stream_id; + struct { + u32 cpu; + u32 reserved; + } cpu_entry; + u64 period; + struct perf_callchain_entry *callchain; + struct perf_raw_record *raw; +}; + +extern void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event); +extern void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs); + +extern int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs); + +/* + * Return 1 for a software event, 0 for a hardware event + */ +static inline int is_software_event(struct perf_event *event) +{ + return (event->attr.type != PERF_TYPE_RAW) && + (event->attr.type != PERF_TYPE_HARDWARE) && + (event->attr.type != PERF_TYPE_HW_CACHE); +} + +extern atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +extern void __perf_sw_event(u32, u64, int, struct pt_regs *, u64); + +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ + if (atomic_read(&perf_swevent_enabled[event_id])) + __perf_sw_event(event_id, nr, nmi, regs, addr); +} + +extern void __perf_event_mmap(struct vm_area_struct *vma); + +static inline void perf_event_mmap(struct vm_area_struct *vma) +{ + if (vma->vm_flags & VM_EXEC) + __perf_event_mmap(vma); +} + +extern void perf_event_comm(struct task_struct *tsk); +extern void perf_event_fork(struct task_struct *tsk); + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + +extern int sysctl_perf_event_paranoid; +extern int sysctl_perf_event_mlock; +extern int sysctl_perf_event_sample_rate; + +extern void perf_event_init(void); +extern void perf_tp_event(int event_id, u64 addr, u64 count, + void *record, int entry_size); + +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_RECORD_MISC_USER : \ + PERF_RECORD_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + +extern int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample); +extern void perf_output_end(struct perf_output_handle *handle); +extern void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len); +#else +static inline void +perf_event_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } +static inline void +perf_event_task_tick(struct task_struct *task, int cpu) { } +static inline int perf_event_init_task(struct task_struct *child) { return 0; } +static inline void perf_event_exit_task(struct task_struct *child) { } +static inline void perf_event_free_task(struct task_struct *task) { } +static inline void perf_event_do_pending(void) { } +static inline void perf_event_print_debug(void) { } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } +static inline int perf_event_task_disable(void) { return -EINVAL; } +static inline int perf_event_task_enable(void) { return -EINVAL; } + +static inline void +perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } + +static inline void perf_event_mmap(struct vm_area_struct *vma) { } +static inline void perf_event_comm(struct task_struct *tsk) { } +static inline void perf_event_fork(struct task_struct *tsk) { } +static inline void perf_event_init(void) { } + +#endif + +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + +#endif /* __KERNEL__ */ +#endif /* _LINUX_PERF_EVENT_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index b00df4c79c63..07bff666e65b 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,7 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 8af3d249170e..8b265a8986d0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,7 +100,7 @@ struct robust_list_head; struct bio; struct fs_struct; struct bts_context; -struct perf_counter_context; +struct perf_event_context; /* * List of flags we want to share for kernel threads, @@ -701,7 +701,7 @@ struct user_struct { #endif #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS atomic_long_t locked_vm; #endif }; @@ -1449,10 +1449,10 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif -#ifdef CONFIG_PERF_COUNTERS - struct perf_counter_context *perf_counter_ctxp; - struct mutex perf_counter_mutex; - struct list_head perf_counter_list; +#ifdef CONFIG_PERF_EVENTS + struct perf_event_context *perf_event_ctxp; + struct mutex perf_event_mutex; + struct list_head perf_event_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a8e37821cc60..02f19f9a76c6 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,7 +55,7 @@ struct compat_timeval; struct robust_list_head; struct getcpu_cache; struct old_linux_dirent; -struct perf_counter_attr; +struct perf_event_attr; #include #include @@ -885,7 +885,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int, int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage long sys_perf_counter_open( - struct perf_counter_attr __user *attr_uptr, +asmlinkage long sys_perf_event_open( + struct perf_event_attr __user *attr_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 72a3b437b829..ec91e78244f0 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -378,7 +378,7 @@ static inline int ftrace_get_offsets_##call( \ #ifdef CONFIG_EVENT_PROFILE /* - * Generate the functions needed for tracepoint perf_counter support. + * Generate the functions needed for tracepoint perf_event support. * * NOTE: The insertion profile callback (ftrace_profile_) is defined later * @@ -656,7 +656,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * { * struct ftrace_data_offsets_ __maybe_unused __data_offsets; * struct ftrace_event_call *event_call = &event_; - * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * extern void perf_tp_event(int, u64, u64, void *, int); * struct ftrace_raw_##call *entry; * u64 __addr = 0, __count = 1; * unsigned long irq_flags; @@ -691,7 +691,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * * <- affect our values * - * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * perf_tp_event(event_call->id, __addr, __count, entry, * __entry_size); <- submit them to perf counter * } while (0); * @@ -712,7 +712,7 @@ static void ftrace_profile_##call(proto) \ { \ struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ struct ftrace_event_call *event_call = &event_##call; \ - extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + extern void perf_tp_event(int, u64, u64, void *, int); \ struct ftrace_raw_##call *entry; \ u64 __addr = 0, __count = 1; \ unsigned long irq_flags; \ @@ -742,7 +742,7 @@ static void ftrace_profile_##call(proto) \ \ { assign; } \ \ - perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + perf_tp_event(event_call->id, __addr, __count, entry,\ __entry_size); \ } while (0); \ \ diff --git a/init/Kconfig b/init/Kconfig index 8e8b76d8a272..cfdf5c322806 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -915,17 +915,17 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. -config HAVE_PERF_COUNTERS +config HAVE_PERF_EVENTS bool help See tools/perf/design.txt for details. menu "Performance Counters" -config PERF_COUNTERS +config PERF_EVENTS bool "Kernel Performance Counters" default y if PROFILING - depends on HAVE_PERF_COUNTERS + depends on HAVE_PERF_EVENTS select ANON_INODES help Enable kernel support for performance counter hardware. @@ -947,7 +947,7 @@ config PERF_COUNTERS config EVENT_PROFILE bool "Tracepoint profiling sources" - depends on PERF_COUNTERS && EVENT_TRACING + depends on PERF_EVENTS && EVENT_TRACING default y help Allow the use of tracepoints as software performance counters. diff --git a/kernel/Makefile b/kernel/Makefile index 3d9c7e27e3f9..e26a546eac44 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,7 +96,7 @@ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_EVENTS) += perf_event.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index ae5d8660ddff..e47ee8a06135 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -47,7 +47,7 @@ #include #include #include -#include +#include #include #include @@ -154,8 +154,8 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(tsk->perf_counter_ctxp); +#ifdef CONFIG_PERF_EVENTS + WARN_ON_ONCE(tsk->perf_event_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); @@ -981,7 +981,7 @@ NORET_TYPE void do_exit(long code) * Flush inherited counters to the parent - before the parent * gets woken up by child-exit notifications. */ - perf_counter_exit_task(tsk); + perf_event_exit_task(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/fork.c b/kernel/fork.c index bfee931ee3fb..2cebfb23b0b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,7 @@ #include #include #include -#include +#include #include #include @@ -1078,7 +1078,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - retval = perf_counter_init_task(p); + retval = perf_event_init_task(p); if (retval) goto bad_fork_cleanup_policy; @@ -1253,7 +1253,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); - perf_counter_fork(p); + perf_event_fork(p); return p; bad_fork_free_pid: @@ -1280,7 +1280,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: - perf_counter_free_task(p); + perf_event_free_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c deleted file mode 100644 index 62de0db8092b..000000000000 --- a/kernel/perf_counter.c +++ /dev/null @@ -1,5000 +0,0 @@ -/* - * Performance counter core code - * - * Copyright (C) 2008 Thomas Gleixner - * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra - * Copyright © 2009 Paul Mackerras, IBM Corp. - * - * For licensing details see kernel-base/COPYING - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -/* - * Each CPU has a list of per CPU counters: - */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - -int perf_max_counters __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - -static atomic_t nr_counters __read_mostly; -static atomic_t nr_mmap_counters __read_mostly; -static atomic_t nr_comm_counters __read_mostly; -static atomic_t nr_task_counters __read_mostly; - -/* - * perf counter paranoia level: - * -1 - not paranoid at all - * 0 - disallow raw tracepoint access for unpriv - * 1 - disallow cpu counters for unpriv - * 2 - disallow kernel profiling for unpriv - */ -int sysctl_perf_counter_paranoid __read_mostly = 1; - -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_counter_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_counter_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_counter_paranoid > 1; -} - -int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ - -/* - * max perf counter sample rate - */ -int sysctl_perf_counter_sample_rate __read_mostly = 100000; - -static atomic64_t perf_counter_id; - -/* - * Lock for (sysadmin-configurable) counter reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} - -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - -void __weak hw_perf_counter_setup(int cpu) { barrier(); } -void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } - -int __weak -hw_perf_group_sched_in(struct perf_counter *group_leader, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, int cpu) -{ - return 0; -} - -void __weak perf_counter_print_debug(void) { } - -static DEFINE_PER_CPU(int, perf_disable_count); - -void __perf_disable(void) -{ - __get_cpu_var(perf_disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(perf_disable_count); -} - -void perf_disable(void) -{ - __perf_disable(); - hw_perf_disable(); -} - -void perf_enable(void) -{ - if (__perf_enable()) - hw_perf_enable(); -} - -static void get_ctx(struct perf_counter_context *ctx) -{ - WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); -} - -static void free_ctx(struct rcu_head *head) -{ - struct perf_counter_context *ctx; - - ctx = container_of(head, struct perf_counter_context, rcu_head); - kfree(ctx); -} - -static void put_ctx(struct perf_counter_context *ctx) -{ - if (atomic_dec_and_test(&ctx->refcount)) { - if (ctx->parent_ctx) - put_ctx(ctx->parent_ctx); - if (ctx->task) - put_task_struct(ctx->task); - call_rcu(&ctx->rcu_head, free_ctx); - } -} - -static void unclone_ctx(struct perf_counter_context *ctx) -{ - if (ctx->parent_ctx) { - put_ctx(ctx->parent_ctx); - ctx->parent_ctx = NULL; - } -} - -/* - * If we inherit counters we want to return the parent counter id - * to userspace. - */ -static u64 primary_counter_id(struct perf_counter *counter) -{ - u64 id = counter->id; - - if (counter->parent) - id = counter->parent->id; - - return id; -} - -/* - * Get the perf_counter_context for a task and lock it. - * This has to cope with with the fact that until it is locked, - * the context could get moved to another task. - */ -static struct perf_counter_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) -{ - struct perf_counter_context *ctx; - - rcu_read_lock(); - retry: - ctx = rcu_dereference(task->perf_counter_ctxp); - if (ctx) { - /* - * If this context is a clone of another, it might - * get swapped for another underneath us by - * perf_counter_task_sched_out, though the - * rcu_read_lock() protects us from any context - * getting freed. Lock the context and check if it - * got swapped before we could get the lock, and retry - * if so. If we locked the right context, then it - * can't get swapped on us any more. - */ - spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_counter_ctxp)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - goto retry; - } - - if (!atomic_inc_not_zero(&ctx->refcount)) { - spin_unlock_irqrestore(&ctx->lock, *flags); - ctx = NULL; - } - } - rcu_read_unlock(); - return ctx; -} - -/* - * Get the context for a task and increment its pin_count so it - * can't get swapped to another task. This also increments its - * reference count so that the context can't get freed. - */ -static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) -{ - struct perf_counter_context *ctx; - unsigned long flags; - - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - ++ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - } - return ctx; -} - -static void perf_unpin_context(struct perf_counter_context *ctx) -{ - unsigned long flags; - - spin_lock_irqsave(&ctx->lock, flags); - --ctx->pin_count; - spin_unlock_irqrestore(&ctx->lock, flags); - put_ctx(ctx); -} - -/* - * Add a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *group_leader = counter->group_leader; - - /* - * Depending on whether it is a standalone or sibling counter, - * add it straight to the context's counter list, or to the group - * leader's sibling list: - */ - if (group_leader == counter) - list_add_tail(&counter->group_entry, &ctx->group_list); - else { - list_add_tail(&counter->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; - } - - list_add_rcu(&counter->event_entry, &ctx->event_list); - ctx->nr_counters++; - if (counter->attr.inherit_stat) - ctx->nr_stat++; -} - -/* - * Remove a counter from the lists for its context. - * Must be called with ctx->mutex and ctx->lock held. - */ -static void -list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) -{ - struct perf_counter *sibling, *tmp; - - if (list_empty(&counter->group_entry)) - return; - ctx->nr_counters--; - if (counter->attr.inherit_stat) - ctx->nr_stat--; - - list_del_init(&counter->group_entry); - list_del_rcu(&counter->event_entry); - - if (counter->group_leader != counter) - counter->group_leader->nr_siblings--; - - /* - * If this was a group counter with sibling counters then - * upgrade the siblings to singleton counters by adding them - * to the context list directly: - */ - list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, group_entry) { - - list_move_tail(&sibling->group_entry, &ctx->group_list); - sibling->group_leader = sibling; - } -} - -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - if (counter->pending_disable) { - counter->pending_disable = 0; - counter->state = PERF_COUNTER_STATE_OFF; - } - counter->tstamp_stopped = ctx->time; - counter->pmu->disable(counter); - counter->oncpu = -1; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu--; - ctx->nr_active--; - if (counter->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); - - /* - * Schedule out siblings (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) - counter_sched_out(counter, cpuctx, ctx); - - if (group_counter->attr.exclusive) - cpuctx->exclusive = 0; -} - -/* - * Cross CPU call to remove a performance counter - * - * We disable the counter on the hardware level first. After that we - * remove it from the context list. - */ -static void __perf_counter_remove_from_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. - */ - perf_disable(); - - counter_sched_out(counter, cpuctx, ctx); - - list_del_counter(counter, ctx); - - if (!ctx->task) { - /* - * Allow more per task counters with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_counters - ctx->nr_counters, - perf_max_counters - perf_reserved_percpu); - } - - perf_enable(); - spin_unlock(&ctx->lock); -} - - -/* - * Remove the counter from a task's (or a CPU's) list of counters. - * - * Must be called with ctx->mutex held. - * - * CPU counters are removed with a smp call. For task counters we only - * call when the task is on a CPU. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This is OK when called from perf_release since - * that only calls us on the top-level context, which can't be a clone. - * When called from perf_counter_exit_task, it's OK because the - * context has been detached from its task. - */ -static void perf_counter_remove_from_context(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are removed via an smp call and - * the removal is always sucessful. - */ - smp_call_function_single(counter->cpu, - __perf_counter_remove_from_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_counter_remove_from_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * If the context is active we need to retry the smp call. - */ - if (ctx->nr_active && !list_empty(&counter->group_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if the call above did not - * succeed. - */ - if (!list_empty(&counter->group_entry)) { - list_del_counter(counter, ctx); - } - spin_unlock_irq(&ctx->lock); -} - -static inline u64 perf_clock(void) -{ - return cpu_clock(smp_processor_id()); -} - -/* - * Update the record of the current time in a context. - */ -static void update_context_time(struct perf_counter_context *ctx) -{ - u64 now = perf_clock(); - - ctx->time += now - ctx->timestamp; - ctx->timestamp = now; -} - -/* - * Update the total_time_enabled and total_time_running fields for a counter. - */ -static void update_counter_times(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - u64 run_end; - - if (counter->state < PERF_COUNTER_STATE_INACTIVE || - counter->group_leader->state < PERF_COUNTER_STATE_INACTIVE) - return; - - counter->total_time_enabled = ctx->time - counter->tstamp_enabled; - - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time; - - counter->total_time_running = run_end - counter->tstamp_running; -} - -/* - * Update total_time_enabled and total_time_running for all counters in a group. - */ -static void update_group_times(struct perf_counter *leader) -{ - struct perf_counter *counter; - - update_counter_times(leader); - list_for_each_entry(counter, &leader->sibling_list, group_entry) - update_counter_times(counter); -} - -/* - * Cross CPU call to disable a performance counter - */ -static void __perf_counter_disable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - spin_lock(&ctx->lock); - - /* - * If the counter is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx); - update_group_times(counter); - if (counter == counter->group_leader) - group_sched_out(counter, cpuctx, ctx); - else - counter_sched_out(counter, cpuctx, ctx); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock(&ctx->lock); -} - -/* - * Disable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisifed when called through - * perf_counter_for_each_child or perf_counter_for_each because they - * hold the top-level counter's child_mutex, so any descendant that - * goes to exit will block in sync_child_counter. - * When called from perf_pending_counter it's OK because counter->ctx - * is the current context on this CPU and preemption is disabled, - * hence we can't get into perf_counter_task_sched_out for this context. - */ -static void perf_counter_disable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_disable, - counter, 1); - return; - } - - retry: - task_oncpu_function_call(task, __perf_counter_disable, counter); - - spin_lock_irq(&ctx->lock); - /* - * If the counter is still active, we need to retry the cross-call. - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - - spin_unlock_irq(&ctx->lock); -} - -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - if (counter->state <= PERF_COUNTER_STATE_OFF) - return 0; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - /* - * The new state must be visible before we turn it on in the hardware: - */ - smp_wmb(); - - if (counter->pmu->enable(counter)) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->oncpu = -1; - return -EAGAIN; - } - - counter->tstamp_running += ctx->time - counter->tstamp_stopped; - - if (!is_software_counter(counter)) - cpuctx->active_oncpu++; - ctx->nr_active++; - - if (counter->attr.exclusive) - cpuctx->exclusive = 1; - - return 0; -} - -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, group_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - - list_for_each_entry(counter, &leader->sibling_list, group_entry) - if (!is_software_counter(counter)) - return 0; - - return 1; -} - -/* - * Work out whether we can put this counter group on the CPU now. - */ -static int group_can_go_on(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - int can_add_hw) -{ - /* - * Groups consisting entirely of software counters can always go on. - */ - if (is_software_only_group(counter)) - return 1; - /* - * If an exclusive group is already on, no other hardware - * counters can go on. - */ - if (cpuctx->exclusive) - return 0; - /* - * If this group is exclusive and there are already - * counters on the CPU, it can't go on. - */ - if (counter->attr.exclusive && cpuctx->active_oncpu) - return 0; - /* - * Otherwise, try to add it if all previous groups were able - * to go on. - */ - return can_add_hw; -} - -static void add_counter_to_ctx(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - list_add_counter(counter, ctx); - counter->tstamp_enabled = ctx->time; - counter->tstamp_running = ctx->time; - counter->tstamp_stopped = ctx->time; -} - -/* - * Cross CPU call to install and enable a performance counter - * - * Must be called with ctx->mutex held - */ -static void __perf_install_in_context(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int cpu = smp_processor_id(); - int err; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - * Or possibly this is the right context but it isn't - * on this cpu because it had no counters. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); - - add_counter_to_ctx(counter, ctx); - - /* - * Don't put the counter on if it is disabled or if - * it is in a group and the group isn't on. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE || - (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) - goto unlock; - - /* - * An exclusive counter can't go on if there are already active - * hardware counters, and no hardware counter can go on if there - * is already an exclusive counter on. - */ - if (!group_can_go_on(counter, cpuctx, 1)) - err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, cpu); - - if (err) { - /* - * This counter couldn't go on. If it is in a group - * then we have to pull the whole group off. - * If the counter group is pinned then put it in error state. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - - unlock: - perf_enable(); - - spin_unlock(&ctx->lock); -} - -/* - * Attach a performance counter to a context - * - * First we add the counter to the list with the hardware enable bit - * in counter->hw_config cleared. - * - * If the counter is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. - * - * Must be called with ctx->mutex held. - */ -static void -perf_install_in_context(struct perf_counter_context *ctx, - struct perf_counter *counter, - int cpu) -{ - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Per cpu counters are installed via an smp call and - * the install is always sucessful. - */ - smp_call_function_single(cpu, __perf_install_in_context, - counter, 1); - return; - } - -retry: - task_oncpu_function_call(task, __perf_install_in_context, - counter); - - spin_lock_irq(&ctx->lock); - /* - * we need to retry the smp call. - */ - if (ctx->is_active && list_empty(&counter->group_entry)) { - spin_unlock_irq(&ctx->lock); - goto retry; - } - - /* - * The lock prevents that this context is scheduled in so we - * can add the counter safely, if it the call above did not - * succeed. - */ - if (list_empty(&counter->group_entry)) - add_counter_to_ctx(counter, ctx); - spin_unlock_irq(&ctx->lock); -} - -/* - * Put a counter into inactive state and update time fields. - * Enabling the leader of a group effectively enables all - * the group members that aren't explicitly disabled, so we - * have to update their ->tstamp_enabled also. - * Note: this works for group members as well as group leaders - * since the non-leader members' sibling_lists will be empty. - */ -static void __perf_counter_mark_enabled(struct perf_counter *counter, - struct perf_counter_context *ctx) -{ - struct perf_counter *sub; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - list_for_each_entry(sub, &counter->sibling_list, group_entry) - if (sub->state >= PERF_COUNTER_STATE_INACTIVE) - sub->tstamp_enabled = - ctx->time - sub->total_time_enabled; -} - -/* - * Cross CPU call to enable a performance counter - */ -static void __perf_counter_enable(void *info) -{ - struct perf_counter *counter = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *leader = counter->group_leader; - int err; - - /* - * If this is a per-task counter, need to check whether this - * counter's task is the current task on this cpu. - */ - if (ctx->task && cpuctx->task_ctx != ctx) { - if (cpuctx->task_ctx || ctx->task != current) - return; - cpuctx->task_ctx = ctx; - } - - spin_lock(&ctx->lock); - ctx->is_active = 1; - update_context_time(ctx); - - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto unlock; - __perf_counter_mark_enabled(counter, ctx); - - /* - * If the counter is in a group and isn't the group leader, - * then don't put it on unless the group is on. - */ - if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(counter, cpuctx, 1)) { - err = -EEXIST; - } else { - perf_disable(); - if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - perf_enable(); - } - - if (err) { - /* - * If this counter can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != counter) - group_sched_out(leader, cpuctx, ctx); - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_COUNTER_STATE_ERROR; - } - } - - unlock: - spin_unlock(&ctx->lock); -} - -/* - * Enable a counter. - * - * If counter->ctx is a cloned context, callers must make sure that - * every task struct that counter->ctx->task could possibly point to - * remains valid. This condition is satisfied when called through - * perf_counter_for_each_child or perf_counter_for_each as described - * for perf_counter_disable. - */ -static void perf_counter_enable(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Enable the counter on the cpu that it's on - */ - smp_call_function_single(counter->cpu, __perf_counter_enable, - counter, 1); - return; - } - - spin_lock_irq(&ctx->lock); - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - goto out; - - /* - * If the counter is in error state, clear that first. - * That way, if we see the counter in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - counter->state = PERF_COUNTER_STATE_OFF; - - retry: - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_counter_enable, counter); - - spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the counter is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) - goto retry; - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (counter->state == PERF_COUNTER_STATE_OFF) - __perf_counter_mark_enabled(counter, ctx); - - out: - spin_unlock_irq(&ctx->lock); -} - -static int perf_counter_refresh(struct perf_counter *counter, int refresh) -{ - /* - * not supported on inherited counters - */ - if (counter->attr.inherit) - return -EINVAL; - - atomic_add(refresh, &counter->event_limit); - perf_counter_enable(counter); - - return 0; -} - -void __perf_counter_sched_out(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx) -{ - struct perf_counter *counter; - - spin_lock(&ctx->lock); - ctx->is_active = 0; - if (likely(!ctx->nr_counters)) - goto out; - update_context_time(ctx); - - perf_disable(); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter != counter->group_leader) - counter_sched_out(counter, cpuctx, ctx); - else - group_sched_out(counter, cpuctx, ctx); - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Test whether two contexts are equivalent, i.e. whether they - * have both been cloned from the same version of the same context - * and they both have the same number of enabled counters. - * If the number of enabled counters is the same, then the set - * of enabled counters should be the same, because these are both - * inherited contexts, therefore we can't access individual counters - * in them directly with an fd; we can only enable/disable all - * counters via prctl, or enable/disable all counters in a family - * via ioctl, which will have the same effect on both contexts. - */ -static int context_equiv(struct perf_counter_context *ctx1, - struct perf_counter_context *ctx2) -{ - return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && !ctx1->pin_count && !ctx2->pin_count; -} - -static void __perf_counter_read(void *counter); - -static void __perf_counter_sync_stat(struct perf_counter *counter, - struct perf_counter *next_counter) -{ - u64 value; - - if (!counter->attr.inherit_stat) - return; - - /* - * Update the counter value, we cannot use perf_counter_read() - * because we're in the middle of a context switch and have IRQs - * disabled, which upsets smp_call_function_single(), however - * we know the counter must be on the current CPU, therefore we - * don't need to use it. - */ - switch (counter->state) { - case PERF_COUNTER_STATE_ACTIVE: - __perf_counter_read(counter); - break; - - case PERF_COUNTER_STATE_INACTIVE: - update_counter_times(counter); - break; - - default: - break; - } - - /* - * In order to keep per-task stats reliable we need to flip the counter - * values when we flip the contexts. - */ - value = atomic64_read(&next_counter->count); - value = atomic64_xchg(&counter->count, value); - atomic64_set(&next_counter->count, value); - - swap(counter->total_time_enabled, next_counter->total_time_enabled); - swap(counter->total_time_running, next_counter->total_time_running); - - /* - * Since we swizzled the values, update the user visible data too. - */ - perf_counter_update_userpage(counter); - perf_counter_update_userpage(next_counter); -} - -#define list_next_entry(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - -static void perf_counter_sync_stat(struct perf_counter_context *ctx, - struct perf_counter_context *next_ctx) -{ - struct perf_counter *counter, *next_counter; - - if (!ctx->nr_stat) - return; - - counter = list_first_entry(&ctx->event_list, - struct perf_counter, event_entry); - - next_counter = list_first_entry(&next_ctx->event_list, - struct perf_counter, event_entry); - - while (&counter->event_entry != &ctx->event_list && - &next_counter->event_entry != &next_ctx->event_list) { - - __perf_counter_sync_stat(counter, next_counter); - - counter = list_next_entry(counter, event_entry); - next_counter = list_next_entry(next_counter, event_entry); - } -} - -/* - * Called from scheduler to remove the counters of the current task, - * with interrupts disabled. - * - * We stop each counter and update the counter value in counter->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * not restart the counter. - */ -void perf_counter_task_sched_out(struct task_struct *task, - struct task_struct *next, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter_context *next_ctx; - struct perf_counter_context *parent; - struct pt_regs *regs; - int do_switch = 1; - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); - - if (likely(!ctx || !cpuctx->task_ctx)) - return; - - update_context_time(ctx); - - rcu_read_lock(); - parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_counter_ctxp; - if (parent && next_ctx && - rcu_dereference(next_ctx->parent_ctx) == parent) { - /* - * Looks like the two contexts are clones, so we might be - * able to optimize the context switch. We lock both - * contexts and check that they are clones under the - * lock (including re-checking that neither has been - * uncloned in the meantime). It doesn't matter which - * order we take the locks because no other cpu could - * be trying to lock both of these tasks. - */ - spin_lock(&ctx->lock); - spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); - if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_counter_ctxp - */ - task->perf_counter_ctxp = next_ctx; - next->perf_counter_ctxp = ctx; - ctx->task = next; - next_ctx->task = task; - do_switch = 0; - - perf_counter_sync_stat(ctx, next_ctx); - } - spin_unlock(&next_ctx->lock); - spin_unlock(&ctx->lock); - } - rcu_read_unlock(); - - if (do_switch) { - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; - } -} - -/* - * Called with IRQs disabled - */ -static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - __perf_counter_sched_out(ctx, cpuctx); - cpuctx->task_ctx = NULL; -} - -/* - * Called with IRQs disabled - */ -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) -{ - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); -} - -static void -__perf_counter_sched_in(struct perf_counter_context *ctx, - struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter *counter; - int can_add_hw = 1; - - spin_lock(&ctx->lock); - ctx->is_active = 1; - if (likely(!ctx->nr_counters)) - goto out; - - ctx->timestamp = perf_clock(); - - perf_disable(); - - /* - * First go through the list and put on any pinned groups - * in order to give them the best chance of going on. - */ - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter->state <= PERF_COUNTER_STATE_OFF || - !counter->attr.pinned) - continue; - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) - counter_sched_in(counter, cpuctx, ctx, cpu); - else { - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); - } - - /* - * If this pinned group hasn't been scheduled, - * put it in error state. - */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_ERROR; - } - } - - list_for_each_entry(counter, &ctx->group_list, group_entry) { - /* - * Ignore counters in OFF or ERROR state, and - * ignore pinned counters since we did them already. - */ - if (counter->state <= PERF_COUNTER_STATE_OFF || - counter->attr.pinned) - continue; - - /* - * Listen to the 'cpu' scheduling filter constraint - * of counters: - */ - if (counter->cpu != -1 && counter->cpu != cpu) - continue; - - if (counter != counter->group_leader) { - if (counter_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } else { - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) - can_add_hw = 0; - } - } - } - perf_enable(); - out: - spin_unlock(&ctx->lock); -} - -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = task->perf_counter_ctxp; - - if (likely(!ctx)) - return; - if (cpuctx->task_ctx == ctx) - return; - __perf_counter_sched_in(ctx, cpuctx, cpu); - cpuctx->task_ctx = ctx; -} - -static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) -{ - struct perf_counter_context *ctx = &cpuctx->ctx; - - __perf_counter_sched_in(ctx, cpuctx, cpu); -} - -#define MAX_INTERRUPTS (~0ULL) - -static void perf_log_throttle(struct perf_counter *counter, int enable); - -static void perf_adjust_period(struct perf_counter *counter, u64 events) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period, sample_period; - s64 delta; - - events *= hwc->sample_period; - period = div64_u64(events, counter->attr.sample_freq); - - delta = (s64)(period - hwc->sample_period); - delta = (delta + 7) / 8; /* low pass filter */ - - sample_period = hwc->sample_period + delta; - - if (!sample_period) - sample_period = 1; - - hwc->sample_period = sample_period; -} - -static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - struct hw_perf_counter *hwc; - u64 interrupts, freq; - - spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - continue; - - hwc = &counter->hw; - - interrupts = hwc->interrupts; - hwc->interrupts = 0; - - /* - * unthrottle counters on the tick - */ - if (interrupts == MAX_INTERRUPTS) { - perf_log_throttle(counter, 1); - counter->pmu->unthrottle(counter); - interrupts = 2*sysctl_perf_counter_sample_rate/HZ; - } - - if (!counter->attr.freq || !counter->attr.sample_freq) - continue; - - /* - * if the specified freq < HZ then we need to skip ticks - */ - if (counter->attr.sample_freq < HZ) { - freq = counter->attr.sample_freq; - - hwc->freq_count += freq; - hwc->freq_interrupts += interrupts; - - if (hwc->freq_count < HZ) - continue; - - interrupts = hwc->freq_interrupts; - hwc->freq_interrupts = 0; - hwc->freq_count -= HZ; - } else - freq = HZ; - - perf_adjust_period(counter, freq * interrupts); - - /* - * In order to avoid being stalled by an (accidental) huge - * sample period, force reset the sample period if we didn't - * get any events in this freq period. - */ - if (!interrupts) { - perf_disable(); - counter->pmu->disable(counter); - atomic64_set(&hwc->period_left, 0); - counter->pmu->enable(counter); - perf_enable(); - } - } - spin_unlock(&ctx->lock); -} - -/* - * Round-robin a context's counters: - */ -static void rotate_ctx(struct perf_counter_context *ctx) -{ - struct perf_counter *counter; - - if (!ctx->nr_counters) - return; - - spin_lock(&ctx->lock); - /* - * Rotate the first entry last (works just fine for group counters too): - */ - perf_disable(); - list_for_each_entry(counter, &ctx->group_list, group_entry) { - list_move_tail(&counter->group_entry, &ctx->group_list); - break; - } - perf_enable(); - - spin_unlock(&ctx->lock); -} - -void perf_counter_task_tick(struct task_struct *curr, int cpu) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - - if (!atomic_read(&nr_counters)) - return; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = curr->perf_counter_ctxp; - - perf_ctx_adjust_freq(&cpuctx->ctx); - if (ctx) - perf_ctx_adjust_freq(ctx); - - perf_counter_cpu_sched_out(cpuctx); - if (ctx) - __perf_counter_task_sched_out(ctx); - - rotate_ctx(&cpuctx->ctx); - if (ctx) - rotate_ctx(ctx); - - perf_counter_cpu_sched_in(cpuctx, cpu); - if (ctx) - perf_counter_task_sched_in(curr, cpu); -} - -/* - * Enable all of a task's counters that have been marked enable-on-exec. - * This expects task == current. - */ -static void perf_counter_enable_on_exec(struct task_struct *task) -{ - struct perf_counter_context *ctx; - struct perf_counter *counter; - unsigned long flags; - int enabled = 0; - - local_irq_save(flags); - ctx = task->perf_counter_ctxp; - if (!ctx || !ctx->nr_counters) - goto out; - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - list_for_each_entry(counter, &ctx->group_list, group_entry) { - if (!counter->attr.enable_on_exec) - continue; - counter->attr.enable_on_exec = 0; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - continue; - __perf_counter_mark_enabled(counter, ctx); - enabled = 1; - } - - /* - * Unclone this context if we enabled any counter. - */ - if (enabled) - unclone_ctx(ctx); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(task, smp_processor_id()); - out: - local_irq_restore(flags); -} - -/* - * Cross CPU call to read the hardware counter - */ -static void __perf_counter_read(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - unsigned long flags; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. In that case - * counter->count would have been updated to a recent sample - * when the counter was scheduled out. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; - - local_irq_save(flags); - if (ctx->is_active) - update_context_time(ctx); - counter->pmu->read(counter); - update_counter_times(counter); - local_irq_restore(flags); -} - -static u64 perf_counter_read(struct perf_counter *counter) -{ - /* - * If counter is enabled and currently active on a CPU, update the - * value in the counter structure: - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - smp_call_function_single(counter->oncpu, - __perf_counter_read, counter, 1); - } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { - update_counter_times(counter); - } - - return atomic64_read(&counter->count); -} - -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->group_list); - INIT_LIST_HEAD(&ctx->event_list); - atomic_set(&ctx->refcount, 1); - ctx->task = task; -} - -static struct perf_counter_context *find_get_context(pid_t pid, int cpu) -{ - struct perf_counter_context *ctx; - struct perf_cpu_context *cpuctx; - struct task_struct *task; - unsigned long flags; - int err; - - /* - * If cpu is not a wildcard then this is a percpu counter: - */ - if (cpu != -1) { - /* Must be root to operate on a CPU counter: */ - if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EACCES); - - if (cpu < 0 || cpu > num_possible_cpus()) - return ERR_PTR(-EINVAL); - - /* - * We could be clever and allow to attach a counter to an - * offline CPU and activate it when the CPU comes up, but - * that's for later. - */ - if (!cpu_isset(cpu, cpu_online_map)) - return ERR_PTR(-ENODEV); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &cpuctx->ctx; - get_ctx(ctx); - - return ctx; - } - - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach counters to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; - - retry: - ctx = perf_lock_task_context(task, &flags); - if (ctx) { - unclone_ctx(ctx); - spin_unlock_irqrestore(&ctx->lock, flags); - } - - if (!ctx) { - ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - err = -ENOMEM; - if (!ctx) - goto errout; - __perf_counter_init_context(ctx, task); - get_ctx(ctx); - if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { - /* - * We raced with some other task; use - * the context they set. - */ - kfree(ctx); - goto retry; - } - get_task_struct(task); - } - - put_task_struct(task); - return ctx; - - errout: - put_task_struct(task); - return ERR_PTR(err); -} - -static void free_counter_rcu(struct rcu_head *head) -{ - struct perf_counter *counter; - - counter = container_of(head, struct perf_counter, rcu_head); - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); -} - -static void perf_pending_sync(struct perf_counter *counter); - -static void free_counter(struct perf_counter *counter) -{ - perf_pending_sync(counter); - - if (!counter->parent) { - atomic_dec(&nr_counters); - if (counter->attr.mmap) - atomic_dec(&nr_mmap_counters); - if (counter->attr.comm) - atomic_dec(&nr_comm_counters); - if (counter->attr.task) - atomic_dec(&nr_task_counters); - } - - if (counter->output) { - fput(counter->output->filp); - counter->output = NULL; - } - - if (counter->destroy) - counter->destroy(counter); - - put_ctx(counter->ctx); - call_rcu(&counter->rcu_head, free_counter_rcu); -} - -/* - * Called when the last reference to the file is gone. - */ -static int perf_release(struct inode *inode, struct file *file) -{ - struct perf_counter *counter = file->private_data; - struct perf_counter_context *ctx = counter->ctx; - - file->private_data = NULL; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&ctx->mutex); - - mutex_lock(&counter->owner->perf_counter_mutex); - list_del_init(&counter->owner_entry); - mutex_unlock(&counter->owner->perf_counter_mutex); - put_task_struct(counter->owner); - - free_counter(counter); - - return 0; -} - -static int perf_counter_read_size(struct perf_counter *counter) -{ - int entry = sizeof(u64); /* value */ - int size = 0; - int nr = 1; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - size += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_ID) - entry += sizeof(u64); - - if (counter->attr.read_format & PERF_FORMAT_GROUP) { - nr += counter->group_leader->nr_siblings; - size += sizeof(u64); - } - - size += entry * nr; - - return size; -} - -static u64 perf_counter_read_value(struct perf_counter *counter) -{ - struct perf_counter *child; - u64 total = 0; - - total += perf_counter_read(counter); - list_for_each_entry(child, &counter->child_list, child_list) - total += perf_counter_read(child); - - return total; -} - -static int perf_counter_read_entry(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - int n = 0, count = 0; - u64 values[2]; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; -} - -static int perf_counter_read_group(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - struct perf_counter *leader = counter->group_leader, *sub; - int n = 0, size = 0, err = -EFAULT; - u64 values[3]; - - values[n++] = 1 + leader->nr_siblings; - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = leader->total_time_enabled + - atomic64_read(&leader->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = leader->total_time_running + - atomic64_read(&leader->child_total_time_running); - } - - size = n * sizeof(u64); - - if (copy_to_user(buf, values, size)) - return -EFAULT; - - err = perf_counter_read_entry(leader, read_format, buf + size); - if (err < 0) - return err; - - size += err; - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - err = perf_counter_read_entry(sub, read_format, - buf + size); - if (err < 0) - return err; - - size += err; - } - - return size; -} - -static int perf_counter_read_one(struct perf_counter *counter, - u64 read_format, char __user *buf) -{ - u64 values[4]; - int n = 0; - - values[n++] = perf_counter_read_value(counter); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - if (copy_to_user(buf, values, n * sizeof(u64))) - return -EFAULT; - - return n * sizeof(u64); -} - -/* - * Read the performance counter - simple non blocking version for now - */ -static ssize_t -perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) -{ - u64 read_format = counter->attr.read_format; - int ret; - - /* - * Return end-of-file for a read on a counter that is in - * error state (i.e. because it was pinned but it couldn't be - * scheduled on to the CPU at some point). - */ - if (counter->state == PERF_COUNTER_STATE_ERROR) - return 0; - - if (count < perf_counter_read_size(counter)) - return -ENOSPC; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - if (read_format & PERF_FORMAT_GROUP) - ret = perf_counter_read_group(counter, read_format, buf); - else - ret = perf_counter_read_one(counter, read_format, buf); - mutex_unlock(&counter->child_mutex); - - return ret; -} - -static ssize_t -perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -{ - struct perf_counter *counter = file->private_data; - - return perf_read_hw(counter, buf, count); -} - -static unsigned int perf_poll(struct file *file, poll_table *wait) -{ - struct perf_counter *counter = file->private_data; - struct perf_mmap_data *data; - unsigned int events = POLL_HUP; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - events = atomic_xchg(&data->poll, 0); - rcu_read_unlock(); - - poll_wait(file, &counter->waitq, wait); - - return events; -} - -static void perf_counter_reset(struct perf_counter *counter) -{ - (void)perf_counter_read(counter); - atomic64_set(&counter->count, 0); - perf_counter_update_userpage(counter); -} - -/* - * Holding the top-level counter's child_mutex means that any - * descendant process that has inherited this counter will block - * in sync_child_counter if it goes to exit, thus satisfying the - * task existence requirements of perf_counter_enable/disable. - */ -static void perf_counter_for_each_child(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter *child; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->child_mutex); - func(counter); - list_for_each_entry(child, &counter->child_list, child_list) - func(child); - mutex_unlock(&counter->child_mutex); -} - -static void perf_counter_for_each(struct perf_counter *counter, - void (*func)(struct perf_counter *)) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_counter *sibling; - - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - counter = counter->group_leader; - - perf_counter_for_each_child(counter, func); - func(counter); - list_for_each_entry(sibling, &counter->sibling_list, group_entry) - perf_counter_for_each_child(counter, func); - mutex_unlock(&ctx->mutex); -} - -static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) -{ - struct perf_counter_context *ctx = counter->ctx; - unsigned long size; - int ret = 0; - u64 value; - - if (!counter->attr.sample_period) - return -EINVAL; - - size = copy_from_user(&value, arg, sizeof(value)); - if (size != sizeof(value)) - return -EFAULT; - - if (!value) - return -EINVAL; - - spin_lock_irq(&ctx->lock); - if (counter->attr.freq) { - if (value > sysctl_perf_counter_sample_rate) { - ret = -EINVAL; - goto unlock; - } - - counter->attr.sample_freq = value; - } else { - counter->attr.sample_period = value; - counter->hw.sample_period = value; - } -unlock: - spin_unlock_irq(&ctx->lock); - - return ret; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd); - -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct perf_counter *counter = file->private_data; - void (*func)(struct perf_counter *); - u32 flags = arg; - - switch (cmd) { - case PERF_COUNTER_IOC_ENABLE: - func = perf_counter_enable; - break; - case PERF_COUNTER_IOC_DISABLE: - func = perf_counter_disable; - break; - case PERF_COUNTER_IOC_RESET: - func = perf_counter_reset; - break; - - case PERF_COUNTER_IOC_REFRESH: - return perf_counter_refresh(counter, arg); - - case PERF_COUNTER_IOC_PERIOD: - return perf_counter_period(counter, (u64 __user *)arg); - - case PERF_COUNTER_IOC_SET_OUTPUT: - return perf_counter_set_output(counter, arg); - - default: - return -ENOTTY; - } - - if (flags & PERF_IOC_FLAG_GROUP) - perf_counter_for_each(counter, func); - else - perf_counter_for_each_child(counter, func); - - return 0; -} - -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_enable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_for_each_child(counter, perf_counter_disable); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -#ifndef PERF_COUNTER_INDEX_OFFSET -# define PERF_COUNTER_INDEX_OFFSET 0 -#endif - -static int perf_counter_index(struct perf_counter *counter) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return 0; - - return counter->hw.idx + 1 - PERF_COUNTER_INDEX_OFFSET; -} - -/* - * Callers need to ensure there can be no nesting of this function, otherwise - * the seqlock logic goes bad. We can not serialize this because the arch - * code calls this from NMI context. - */ -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_counter_mmap_page *userpg; - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - userpg = data->user_page; - - /* - * Disable preemption so as to not let the corresponding user-space - * spin too long if we get preempted. - */ - preempt_disable(); - ++userpg->lock; - barrier(); - userpg->index = perf_counter_index(counter); - userpg->offset = atomic64_read(&counter->count); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - userpg->offset -= atomic64_read(&counter->hw.prev_count); - - userpg->time_enabled = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - - userpg->time_running = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - - barrier(); - ++userpg->lock; - preempt_enable(); -unlock: - rcu_read_unlock(); -} - -static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct perf_counter *counter = vma->vm_file->private_data; - struct perf_mmap_data *data; - int ret = VM_FAULT_SIGBUS; - - if (vmf->flags & FAULT_FLAG_MKWRITE) { - if (vmf->pgoff == 0) - ret = 0; - return ret; - } - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (!data) - goto unlock; - - if (vmf->pgoff == 0) { - vmf->page = virt_to_page(data->user_page); - } else { - int nr = vmf->pgoff - 1; - - if ((unsigned)nr > data->nr_pages) - goto unlock; - - if (vmf->flags & FAULT_FLAG_WRITE) - goto unlock; - - vmf->page = virt_to_page(data->data_pages[nr]); - } - - get_page(vmf->page); - vmf->page->mapping = vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - ret = 0; -unlock: - rcu_read_unlock(); - - return ret; -} - -static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) -{ - struct perf_mmap_data *data; - unsigned long size; - int i; - - WARN_ON(atomic_read(&counter->mmap_count)); - - size = sizeof(struct perf_mmap_data); - size += nr_pages * sizeof(void *); - - data = kzalloc(size, GFP_KERNEL); - if (!data) - goto fail; - - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->user_page) - goto fail_user_page; - - for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); - if (!data->data_pages[i]) - goto fail_data_pages; - } - - data->nr_pages = nr_pages; - atomic_set(&data->lock, -1); - - if (counter->attr.watermark) { - data->watermark = min_t(long, PAGE_SIZE * nr_pages, - counter->attr.wakeup_watermark); - } - if (!data->watermark) - data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); - - rcu_assign_pointer(counter->data, data); - - return 0; - -fail_data_pages: - for (i--; i >= 0; i--) - free_page((unsigned long)data->data_pages[i]); - - free_page((unsigned long)data->user_page); - -fail_user_page: - kfree(data); - -fail: - return -ENOMEM; -} - -static void perf_mmap_free_page(unsigned long addr) -{ - struct page *page = virt_to_page((void *)addr); - - page->mapping = NULL; - __free_page(page); -} - -static void __perf_mmap_data_free(struct rcu_head *rcu_head) -{ - struct perf_mmap_data *data; - int i; - - data = container_of(rcu_head, struct perf_mmap_data, rcu_head); - - perf_mmap_free_page((unsigned long)data->user_page); - for (i = 0; i < data->nr_pages; i++) - perf_mmap_free_page((unsigned long)data->data_pages[i]); - - kfree(data); -} - -static void perf_mmap_data_free(struct perf_counter *counter) -{ - struct perf_mmap_data *data = counter->data; - - WARN_ON(atomic_read(&counter->mmap_count)); - - rcu_assign_pointer(counter->data, NULL); - call_rcu(&data->rcu_head, __perf_mmap_data_free); -} - -static void perf_mmap_open(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - atomic_inc(&counter->mmap_count); -} - -static void perf_mmap_close(struct vm_area_struct *vma) -{ - struct perf_counter *counter = vma->vm_file->private_data; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - struct user_struct *user = current_user(); - - atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= counter->data->nr_locked; - perf_mmap_data_free(counter); - mutex_unlock(&counter->mmap_mutex); - } -} - -static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, - .close = perf_mmap_close, - .fault = perf_mmap_fault, - .page_mkwrite = perf_mmap_fault, -}; - -static int perf_mmap(struct file *file, struct vm_area_struct *vma) -{ - struct perf_counter *counter = file->private_data; - unsigned long user_locked, user_lock_limit; - struct user_struct *user = current_user(); - unsigned long locked, lock_limit; - unsigned long vma_size; - unsigned long nr_pages; - long user_extra, extra; - int ret = 0; - - if (!(vma->vm_flags & VM_SHARED)) - return -EINVAL; - - vma_size = vma->vm_end - vma->vm_start; - nr_pages = (vma_size / PAGE_SIZE) - 1; - - /* - * If we have data pages ensure they're a power-of-two number, so we - * can do bitmasks instead of modulo. - */ - if (nr_pages != 0 && !is_power_of_2(nr_pages)) - return -EINVAL; - - if (vma_size != PAGE_SIZE * (1 + nr_pages)) - return -EINVAL; - - if (vma->vm_pgoff != 0) - return -EINVAL; - - WARN_ON_ONCE(counter->ctx->parent_ctx); - mutex_lock(&counter->mmap_mutex); - if (counter->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&counter->mmap_count)) { - if (nr_pages != counter->data->nr_pages) - ret = -EINVAL; - goto unlock; - } - - user_extra = nr_pages + 1; - user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - - /* - * Increase the limit linearly with more CPUs: - */ - user_lock_limit *= num_online_cpus(); - - user_locked = atomic_long_read(&user->locked_vm) + user_extra; - - extra = 0; - if (user_locked > user_lock_limit) - extra = user_locked - user_lock_limit; - - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - lock_limit >>= PAGE_SHIFT; - locked = vma->vm_mm->locked_vm + extra; - - if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && - !capable(CAP_IPC_LOCK)) { - ret = -EPERM; - goto unlock; - } - - WARN_ON(counter->data); - ret = perf_mmap_data_alloc(counter, nr_pages); - if (ret) - goto unlock; - - atomic_set(&counter->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - counter->data->nr_locked = extra; - if (vma->vm_flags & VM_WRITE) - counter->data->writable = 1; - -unlock: - mutex_unlock(&counter->mmap_mutex); - - vma->vm_flags |= VM_RESERVED; - vma->vm_ops = &perf_mmap_vmops; - - return ret; -} - -static int perf_fasync(int fd, struct file *filp, int on) -{ - struct inode *inode = filp->f_path.dentry->d_inode; - struct perf_counter *counter = filp->private_data; - int retval; - - mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &counter->fasync); - mutex_unlock(&inode->i_mutex); - - if (retval < 0) - return retval; - - return 0; -} - -static const struct file_operations perf_fops = { - .release = perf_release, - .read = perf_read, - .poll = perf_poll, - .unlocked_ioctl = perf_ioctl, - .compat_ioctl = perf_ioctl, - .mmap = perf_mmap, - .fasync = perf_fasync, -}; - -/* - * Perf counter wakeup - * - * If there's data, ensure we set the poll() state and publish everything - * to user-space before waking everybody up. - */ - -void perf_counter_wakeup(struct perf_counter *counter) -{ - wake_up_all(&counter->waitq); - - if (counter->pending_kill) { - kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); - counter->pending_kill = 0; - } -} - -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. - */ - -static void perf_pending_counter(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - if (counter->pending_disable) { - counter->pending_disable = 0; - __perf_counter_disable(counter); - } - - if (counter->pending_wakeup) { - counter->pending_wakeup = 0; - perf_counter_wakeup(counter); - } -} - -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_counter_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_counter *counter) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return counter->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_counter *counter) -{ - wait_event(counter->waitq, perf_not_pending(counter)); -} - -void perf_counter_do_pending(void) -{ - __perf_pending_run(); -} - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - return NULL; -} - -/* - * Output - */ -static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, - unsigned long offset, unsigned long head) -{ - unsigned long mask; - - if (!data->writable) - return true; - - mask = (data->nr_pages << PAGE_SHIFT) - 1; - - offset = (offset - tail) & mask; - head = (head - tail) & mask; - - if ((int)(head - offset) < 0) - return false; - - return true; -} - -static void perf_output_wakeup(struct perf_output_handle *handle) -{ - atomic_set(&handle->data->poll, POLL_IN); - - if (handle->nmi) { - handle->counter->pending_wakeup = 1; - perf_pending_queue(&handle->counter->pending, - perf_pending_counter); - } else - perf_counter_wakeup(handle->counter); -} - -/* - * Curious locking construct. - * - * We need to ensure a later event doesn't publish a head when a former - * event isn't done writing. However since we need to deal with NMIs we - * cannot fully serialize things. - * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * - * We only publish the head (and generate a wakeup) when the outer-most - * event completes. - */ -static void perf_output_lock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - int cpu; - - handle->locked = 0; - - local_irq_save(handle->flags); - cpu = smp_processor_id(); - - if (in_nmi() && atomic_read(&data->lock) == cpu) - return; - - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - handle->locked = 1; -} - -static void perf_output_unlock(struct perf_output_handle *handle) -{ - struct perf_mmap_data *data = handle->data; - unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; - -again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; - - /* - * NMI can happen here, which means we can miss a done_head update. - */ - - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); - - /* - * Therefore we have to validate we did not indeed do so. - */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); - - goto again; - } - - if (atomic_xchg(&data->wakeup, 0)) - perf_output_wakeup(handle); -out: - local_irq_restore(handle->flags); -} - -void perf_output_copy(struct perf_output_handle *handle, - const void *buf, unsigned int len) -{ - unsigned int pages_mask; - unsigned int offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - - do { - unsigned int page_offset; - int nr; - - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_offset = offset & (PAGE_SIZE - 1); - size = min_t(unsigned int, PAGE_SIZE - page_offset, len); - - memcpy(pages[nr] + page_offset, buf, size); - - len -= size; - buf += size; - offset += size; - } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); -} - -int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size, - int nmi, int sample) -{ - struct perf_counter *output_counter; - struct perf_mmap_data *data; - unsigned long tail, offset, head; - int have_lost; - struct { - struct perf_event_header header; - u64 id; - u64 lost; - } lost_event; - - rcu_read_lock(); - /* - * For inherited counters we send all the output towards the parent. - */ - if (counter->parent) - counter = counter->parent; - - output_counter = rcu_dereference(counter->output); - if (output_counter) - counter = output_counter; - - data = rcu_dereference(counter->data); - if (!data) - goto out; - - handle->data = data; - handle->counter = counter; - handle->nmi = nmi; - handle->sample = sample; - - if (!data->nr_pages) - goto fail; - - have_lost = atomic_read(&data->lost); - if (have_lost) - size += sizeof(lost_event); - - perf_output_lock(handle); - - do { - /* - * Userspace could choose to issue a mb() before updating the - * tail pointer. So that all reads will be completed before the - * write is issued. - */ - tail = ACCESS_ONCE(data->user_page->data_tail); - smp_rmb(); - offset = head = atomic_long_read(&data->head); - head += size; - if (unlikely(!perf_output_space(data, tail, offset, head))) - goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); - - handle->offset = offset; - handle->head = head; - - if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); - - if (have_lost) { - lost_event.header.type = PERF_EVENT_LOST; - lost_event.header.misc = 0; - lost_event.header.size = sizeof(lost_event); - lost_event.id = counter->id; - lost_event.lost = atomic_xchg(&data->lost, 0); - - perf_output_put(handle, lost_event); - } - - return 0; - -fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); -out: - rcu_read_unlock(); - - return -ENOSPC; -} - -void perf_output_end(struct perf_output_handle *handle) -{ - struct perf_counter *counter = handle->counter; - struct perf_mmap_data *data = handle->data; - - int wakeup_events = counter->attr.wakeup_events; - - if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); - if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); - } - } - - perf_output_unlock(handle); - rcu_read_unlock(); -} - -static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_tgid_nr_ns(p, counter->ns); -} - -static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) -{ - /* - * only top level counters have the pid namespace they were created in - */ - if (counter->parent) - counter = counter->parent; - - return task_pid_nr_ns(p, counter->ns); -} - -static void perf_output_read_one(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - u64 read_format = counter->attr.read_format; - u64 values[4]; - int n = 0; - - values[n++] = atomic64_read(&counter->count); - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - } - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - } - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); - - perf_output_copy(handle, values, n * sizeof(u64)); -} - -/* - * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. - */ -static void perf_output_read_group(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - struct perf_counter *leader = counter->group_leader, *sub; - u64 read_format = counter->attr.read_format; - u64 values[5]; - int n = 0; - - values[n++] = 1 + leader->nr_siblings; - - if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = leader->total_time_enabled; - - if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = leader->total_time_running; - - if (leader != counter) - leader->pmu->read(leader); - - values[n++] = atomic64_read(&leader->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(leader); - - perf_output_copy(handle, values, n * sizeof(u64)); - - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - n = 0; - - if (sub != counter) - sub->pmu->read(sub); - - values[n++] = atomic64_read(&sub->count); - if (read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(sub); - - perf_output_copy(handle, values, n * sizeof(u64)); - } -} - -static void perf_output_read(struct perf_output_handle *handle, - struct perf_counter *counter) -{ - if (counter->attr.read_format & PERF_FORMAT_GROUP) - perf_output_read_group(handle, counter); - else - perf_output_read_one(handle, counter); -} - -void perf_output_sample(struct perf_output_handle *handle, - struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter) -{ - u64 sample_type = data->type; - - perf_output_put(handle, *header); - - if (sample_type & PERF_SAMPLE_IP) - perf_output_put(handle, data->ip); - - if (sample_type & PERF_SAMPLE_TID) - perf_output_put(handle, data->tid_entry); - - if (sample_type & PERF_SAMPLE_TIME) - perf_output_put(handle, data->time); - - if (sample_type & PERF_SAMPLE_ADDR) - perf_output_put(handle, data->addr); - - if (sample_type & PERF_SAMPLE_ID) - perf_output_put(handle, data->id); - - if (sample_type & PERF_SAMPLE_STREAM_ID) - perf_output_put(handle, data->stream_id); - - if (sample_type & PERF_SAMPLE_CPU) - perf_output_put(handle, data->cpu_entry); - - if (sample_type & PERF_SAMPLE_PERIOD) - perf_output_put(handle, data->period); - - if (sample_type & PERF_SAMPLE_READ) - perf_output_read(handle, counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - if (data->callchain) { - int size = 1; - - if (data->callchain) - size += data->callchain->nr; - - size *= sizeof(u64); - - perf_output_copy(handle, data->callchain, size); - } else { - u64 nr = 0; - perf_output_put(handle, nr); - } - } - - if (sample_type & PERF_SAMPLE_RAW) { - if (data->raw) { - perf_output_put(handle, data->raw->size); - perf_output_copy(handle, data->raw->data, - data->raw->size); - } else { - struct { - u32 size; - u32 data; - } raw = { - .size = sizeof(u32), - .data = 0, - }; - perf_output_put(handle, raw); - } - } -} - -void perf_prepare_sample(struct perf_event_header *header, - struct perf_sample_data *data, - struct perf_counter *counter, - struct pt_regs *regs) -{ - u64 sample_type = counter->attr.sample_type; - - data->type = sample_type; - - header->type = PERF_EVENT_SAMPLE; - header->size = sizeof(*header); - - header->misc = 0; - header->misc |= perf_misc_flags(regs); - - if (sample_type & PERF_SAMPLE_IP) { - data->ip = perf_instruction_pointer(regs); - - header->size += sizeof(data->ip); - } - - if (sample_type & PERF_SAMPLE_TID) { - /* namespace issues */ - data->tid_entry.pid = perf_counter_pid(counter, current); - data->tid_entry.tid = perf_counter_tid(counter, current); - - header->size += sizeof(data->tid_entry); - } - - if (sample_type & PERF_SAMPLE_TIME) { - data->time = perf_clock(); - - header->size += sizeof(data->time); - } - - if (sample_type & PERF_SAMPLE_ADDR) - header->size += sizeof(data->addr); - - if (sample_type & PERF_SAMPLE_ID) { - data->id = primary_counter_id(counter); - - header->size += sizeof(data->id); - } - - if (sample_type & PERF_SAMPLE_STREAM_ID) { - data->stream_id = counter->id; - - header->size += sizeof(data->stream_id); - } - - if (sample_type & PERF_SAMPLE_CPU) { - data->cpu_entry.cpu = raw_smp_processor_id(); - data->cpu_entry.reserved = 0; - - header->size += sizeof(data->cpu_entry); - } - - if (sample_type & PERF_SAMPLE_PERIOD) - header->size += sizeof(data->period); - - if (sample_type & PERF_SAMPLE_READ) - header->size += perf_counter_read_size(counter); - - if (sample_type & PERF_SAMPLE_CALLCHAIN) { - int size = 1; - - data->callchain = perf_callchain(regs); - - if (data->callchain) - size += data->callchain->nr; - - header->size += size * sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_RAW) { - int size = sizeof(u32); - - if (data->raw) - size += data->raw->size; - else - size += sizeof(u32); - - WARN_ON_ONCE(size & (sizeof(u64)-1)); - header->size += size; - } -} - -static void perf_counter_output(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_output_handle handle; - struct perf_event_header header; - - perf_prepare_sample(&header, data, counter, regs); - - if (perf_output_begin(&handle, counter, header.size, nmi, 1)) - return; - - perf_output_sample(&handle, &header, data, counter); - - perf_output_end(&handle); -} - -/* - * read event - */ - -struct perf_read_event { - struct perf_event_header header; - - u32 pid; - u32 tid; -}; - -static void -perf_counter_read_event(struct perf_counter *counter, - struct task_struct *task) -{ - struct perf_output_handle handle; - struct perf_read_event read_event = { - .header = { - .type = PERF_EVENT_READ, - .misc = 0, - .size = sizeof(read_event) + perf_counter_read_size(counter), - }, - .pid = perf_counter_pid(counter, task), - .tid = perf_counter_tid(counter, task), - }; - int ret; - - ret = perf_output_begin(&handle, counter, read_event.header.size, 0, 0); - if (ret) - return; - - perf_output_put(&handle, read_event); - perf_output_read(&handle, counter); - - perf_output_end(&handle); -} - -/* - * task tracking -- fork/exit - * - * enabled by: attr.comm | attr.mmap | attr.task - */ - -struct perf_task_event { - struct task_struct *task; - struct perf_counter_context *task_ctx; - - struct { - struct perf_event_header header; - - u32 pid; - u32 ppid; - u32 tid; - u32 ptid; - u64 time; - } event; -}; - -static void perf_counter_task_output(struct perf_counter *counter, - struct perf_task_event *task_event) -{ - struct perf_output_handle handle; - int size; - struct task_struct *task = task_event->task; - int ret; - - size = task_event->event.header.size; - ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - task_event->event.pid = perf_counter_pid(counter, task); - task_event->event.ppid = perf_counter_pid(counter, current); - - task_event->event.tid = perf_counter_tid(counter, task); - task_event->event.ptid = perf_counter_tid(counter, current); - - task_event->event.time = perf_clock(); - - perf_output_put(&handle, task_event->event); - - perf_output_end(&handle); -} - -static int perf_counter_task_match(struct perf_counter *counter) -{ - if (counter->attr.comm || counter->attr.mmap || counter->attr.task) - return 1; - - return 0; -} - -static void perf_counter_task_ctx(struct perf_counter_context *ctx, - struct perf_task_event *task_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_task_match(counter)) - perf_counter_task_output(counter, task_event); - } - rcu_read_unlock(); -} - -static void perf_counter_task_event(struct perf_task_event *task_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx = task_event->task_ctx; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_task_ctx(&cpuctx->ctx, task_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - if (!ctx) - ctx = rcu_dereference(task_event->task->perf_counter_ctxp); - if (ctx) - perf_counter_task_ctx(ctx, task_event); - rcu_read_unlock(); -} - -static void perf_counter_task(struct task_struct *task, - struct perf_counter_context *task_ctx, - int new) -{ - struct perf_task_event task_event; - - if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters) && - !atomic_read(&nr_task_counters)) - return; - - task_event = (struct perf_task_event){ - .task = task, - .task_ctx = task_ctx, - .event = { - .header = { - .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, - .misc = 0, - .size = sizeof(task_event.event), - }, - /* .pid */ - /* .ppid */ - /* .tid */ - /* .ptid */ - }, - }; - - perf_counter_task_event(&task_event); -} - -void perf_counter_fork(struct task_struct *task) -{ - perf_counter_task(task, NULL, 1); -} - -/* - * comm tracking - */ - -struct perf_comm_event { - struct task_struct *task; - char *comm; - int comm_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - } event; -}; - -static void perf_counter_comm_output(struct perf_counter *counter, - struct perf_comm_event *comm_event) -{ - struct perf_output_handle handle; - int size = comm_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - comm_event->event.pid = perf_counter_pid(counter, comm_event->task); - comm_event->event.tid = perf_counter_tid(counter, comm_event->task); - - perf_output_put(&handle, comm_event->event); - perf_output_copy(&handle, comm_event->comm, - comm_event->comm_size); - perf_output_end(&handle); -} - -static int perf_counter_comm_match(struct perf_counter *counter) -{ - if (counter->attr.comm) - return 1; - - return 0; -} - -static void perf_counter_comm_ctx(struct perf_counter_context *ctx, - struct perf_comm_event *comm_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_comm_match(counter)) - perf_counter_comm_output(counter, comm_event); - } - rcu_read_unlock(); -} - -static void perf_counter_comm_event(struct perf_comm_event *comm_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - unsigned int size; - char comm[TASK_COMM_LEN]; - - memset(comm, 0, sizeof(comm)); - strncpy(comm, comm_event->task->comm, sizeof(comm)); - size = ALIGN(strlen(comm)+1, sizeof(u64)); - - comm_event->comm = comm; - comm_event->comm_size = size; - - comm_event->event.header.size = sizeof(comm_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(&cpuctx->ctx, comm_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_comm_ctx(ctx, comm_event); - rcu_read_unlock(); -} - -void perf_counter_comm(struct task_struct *task) -{ - struct perf_comm_event comm_event; - - if (task->perf_counter_ctxp) - perf_counter_enable_on_exec(task); - - if (!atomic_read(&nr_comm_counters)) - return; - - comm_event = (struct perf_comm_event){ - .task = task, - /* .comm */ - /* .comm_size */ - .event = { - .header = { - .type = PERF_EVENT_COMM, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - }, - }; - - perf_counter_comm_event(&comm_event); -} - -/* - * mmap tracking - */ - -struct perf_mmap_event { - struct vm_area_struct *vma; - - const char *file_name; - int file_size; - - struct { - struct perf_event_header header; - - u32 pid; - u32 tid; - u64 start; - u64 len; - u64 pgoff; - } event; -}; - -static void perf_counter_mmap_output(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - struct perf_output_handle handle; - int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0, 0); - - if (ret) - return; - - mmap_event->event.pid = perf_counter_pid(counter, current); - mmap_event->event.tid = perf_counter_tid(counter, current); - - perf_output_put(&handle, mmap_event->event); - perf_output_copy(&handle, mmap_event->file_name, - mmap_event->file_size); - perf_output_end(&handle); -} - -static int perf_counter_mmap_match(struct perf_counter *counter, - struct perf_mmap_event *mmap_event) -{ - if (counter->attr.mmap) - return 1; - - return 0; -} - -static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, - struct perf_mmap_event *mmap_event) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_mmap_match(counter, mmap_event)) - perf_counter_mmap_output(counter, mmap_event); - } - rcu_read_unlock(); -} - -static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) -{ - struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; - struct vm_area_struct *vma = mmap_event->vma; - struct file *file = vma->vm_file; - unsigned int size; - char tmp[16]; - char *buf = NULL; - const char *name; - - memset(tmp, 0, sizeof(tmp)); - - if (file) { - /* - * d_path works from the end of the buffer backwards, so we - * need to add enough zero bytes after the string to handle - * the 64bit alignment we do later. - */ - buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); - if (!buf) { - name = strncpy(tmp, "//enomem", sizeof(tmp)); - goto got_name; - } - name = d_path(&file->f_path, buf, PATH_MAX); - if (IS_ERR(name)) { - name = strncpy(tmp, "//toolong", sizeof(tmp)); - goto got_name; - } - } else { - if (arch_vma_name(mmap_event->vma)) { - name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); - goto got_name; - } - - if (!vma->vm_mm) { - name = strncpy(tmp, "[vdso]", sizeof(tmp)); - goto got_name; - } - - name = strncpy(tmp, "//anon", sizeof(tmp)); - goto got_name; - } - -got_name: - size = ALIGN(strlen(name)+1, sizeof(u64)); - - mmap_event->file_name = name; - mmap_event->file_size = size; - - mmap_event->event.header.size = sizeof(mmap_event->event) + size; - - cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); - put_cpu_var(perf_cpu_context); - - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_counter_mmap_ctx(ctx, mmap_event); - rcu_read_unlock(); - - kfree(buf); -} - -void __perf_counter_mmap(struct vm_area_struct *vma) -{ - struct perf_mmap_event mmap_event; - - if (!atomic_read(&nr_mmap_counters)) - return; - - mmap_event = (struct perf_mmap_event){ - .vma = vma, - /* .file_name */ - /* .file_size */ - .event = { - .header = { - .type = PERF_EVENT_MMAP, - .misc = 0, - /* .size */ - }, - /* .pid */ - /* .tid */ - .start = vma->vm_start, - .len = vma->vm_end - vma->vm_start, - .pgoff = vma->vm_pgoff, - }, - }; - - perf_counter_mmap_event(&mmap_event); -} - -/* - * IRQ throttle logging - */ - -static void perf_log_throttle(struct perf_counter *counter, int enable) -{ - struct perf_output_handle handle; - int ret; - - struct { - struct perf_event_header header; - u64 time; - u64 id; - u64 stream_id; - } throttle_event = { - .header = { - .type = PERF_EVENT_THROTTLE, - .misc = 0, - .size = sizeof(throttle_event), - }, - .time = perf_clock(), - .id = primary_counter_id(counter), - .stream_id = counter->id, - }; - - if (enable) - throttle_event.header.type = PERF_EVENT_UNTHROTTLE; - - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, throttle_event); - perf_output_end(&handle); -} - -/* - * Generic counter overflow handling, sampling. - */ - -static int __perf_counter_overflow(struct perf_counter *counter, int nmi, - int throttle, struct perf_sample_data *data, - struct pt_regs *regs) -{ - int events = atomic_read(&counter->event_limit); - struct hw_perf_counter *hwc = &counter->hw; - int ret = 0; - - throttle = (throttle && counter->pmu->unthrottle != NULL); - - if (!throttle) { - hwc->interrupts++; - } else { - if (hwc->interrupts != MAX_INTERRUPTS) { - hwc->interrupts++; - if (HZ * hwc->interrupts > - (u64)sysctl_perf_counter_sample_rate) { - hwc->interrupts = MAX_INTERRUPTS; - perf_log_throttle(counter, 0); - ret = 1; - } - } else { - /* - * Keep re-disabling counters even though on the previous - * pass we disabled it - just in case we raced with a - * sched-in and the counter got enabled again: - */ - ret = 1; - } - } - - if (counter->attr.freq) { - u64 now = perf_clock(); - s64 delta = now - hwc->freq_stamp; - - hwc->freq_stamp = now; - - if (delta > 0 && delta < TICK_NSEC) - perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); - } - - /* - * XXX event_limit might not quite work as expected on inherited - * counters - */ - - counter->pending_kill = POLL_IN; - if (events && atomic_dec_and_test(&counter->event_limit)) { - ret = 1; - counter->pending_kill = POLL_HUP; - if (nmi) { - counter->pending_disable = 1; - perf_pending_queue(&counter->pending, - perf_pending_counter); - } else - perf_counter_disable(counter); - } - - perf_counter_output(counter, nmi, data, regs); - return ret; -} - -int perf_counter_overflow(struct perf_counter *counter, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - return __perf_counter_overflow(counter, nmi, 1, data, regs); -} - -/* - * Generic software counter infrastructure - */ - -/* - * We directly increment counter->count and keep a second value in - * counter->hw.period_left to count intervals. This period counter - * is kept in the range [-sample_period, 0] so that we can use the - * sign as trigger. - */ - -static u64 perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 period = hwc->last_period; - u64 nr, offset; - s64 old, val; - - hwc->last_period = hwc->sample_period; - -again: - old = val = atomic64_read(&hwc->period_left); - if (val < 0) - return 0; - - nr = div64_u64(period + val, period); - offset = nr * period; - val -= offset; - if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) - goto again; - - return nr; -} - -static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_counter *hwc = &counter->hw; - int throttle = 0; - u64 overflow; - - data->period = counter->hw.last_period; - overflow = perf_swcounter_set_period(counter); - - if (hwc->interrupts == MAX_INTERRUPTS) - return; - - for (; overflow; overflow--) { - if (__perf_counter_overflow(counter, nmi, throttle, - data, regs)) { - /* - * We inhibit the overflow from happening when - * hwc->interrupts == MAX_INTERRUPTS. - */ - break; - } - throttle = 1; - } -} - -static void perf_swcounter_unthrottle(struct perf_counter *counter) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct hw_perf_counter *hwc = &counter->hw; - - atomic64_add(nr, &counter->count); - - if (!hwc->sample_period) - return; - - if (!regs) - return; - - if (!atomic64_add_negative(nr, &hwc->period_left)) - perf_swcounter_overflow(counter, nmi, data, regs); -} - -static int perf_swcounter_is_counting(struct perf_counter *counter) -{ - /* - * The counter is active, we're good! - */ - if (counter->state == PERF_COUNTER_STATE_ACTIVE) - return 1; - - /* - * The counter is off/error, not counting. - */ - if (counter->state != PERF_COUNTER_STATE_INACTIVE) - return 0; - - /* - * The counter is inactive, if the context is active - * we're part of a group that didn't make it on the 'pmu', - * not counting. - */ - if (counter->ctx->is_active) - return 0; - - /* - * We're inactive and the context is too, this means the - * task is scheduled out, we're counting events that happen - * to us, like migration events. - */ - return 1; -} - -static int perf_swcounter_match(struct perf_counter *counter, - enum perf_type_id type, - u32 event_id, struct pt_regs *regs) -{ - if (!perf_swcounter_is_counting(counter)) - return 0; - - if (counter->attr.type != type) - return 0; - if (counter->attr.config != event_id) - return 0; - - if (regs) { - if (counter->attr.exclude_user && user_mode(regs)) - return 0; - - if (counter->attr.exclude_kernel && !user_mode(regs)) - return 0; - } - - return 1; -} - -static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum perf_type_id type, - u32 event_id, u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_counter *counter; - - if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) - return; - - rcu_read_lock(); - list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, type, event_id, regs)) - perf_swcounter_add(counter, nr, nmi, data, regs); - } - rcu_read_unlock(); -} - -static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) -{ - if (in_nmi()) - return &cpuctx->recursion[3]; - - if (in_irq()) - return &cpuctx->recursion[2]; - - if (in_softirq()) - return &cpuctx->recursion[1]; - - return &cpuctx->recursion[0]; -} - -static void do_perf_swcounter_event(enum perf_type_id type, u32 event, - u64 nr, int nmi, - struct perf_sample_data *data, - struct pt_regs *regs) -{ - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - int *recursion = perf_swcounter_recursion_context(cpuctx); - struct perf_counter_context *ctx; - - if (*recursion) - goto out; - - (*recursion)++; - barrier(); - - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, - nr, nmi, data, regs); - rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); - if (ctx) - perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs); - rcu_read_unlock(); - - barrier(); - (*recursion)--; - -out: - put_cpu_var(perf_cpu_context); -} - -void __perf_swcounter_event(u32 event, u64 nr, int nmi, - struct pt_regs *regs, u64 addr) -{ - struct perf_sample_data data = { - .addr = addr, - }; - - do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, - &data, regs); -} - -static void perf_swcounter_read(struct perf_counter *counter) -{ -} - -static int perf_swcounter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - - if (hwc->sample_period) { - hwc->last_period = hwc->sample_period; - perf_swcounter_set_period(counter); - } - return 0; -} - -static void perf_swcounter_disable(struct perf_counter *counter) -{ -} - -static const struct pmu perf_ops_generic = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, - .unthrottle = perf_swcounter_unthrottle, -}; - -/* - * hrtimer based swcounter callback - */ - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. - */ - if ((counter->attr.exclude_kernel || !regs) && - !counter->attr.exclude_user) - regs = task_pt_regs(current); - - if (regs) { - if (perf_counter_overflow(counter, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -/* - * Software counter: cpu wall time clock - */ - -static void cpu_clock_perf_counter_update(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = atomic64_read(&counter->hw.prev_count); - atomic64_set(&counter->hw.prev_count, now); - atomic64_add(now - prev, &counter->count); -} - -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - int cpu = raw_smp_processor_id(); - - atomic64_set(&hwc->prev_count, cpu_clock(cpu)); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void cpu_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - cpu_clock_perf_counter_update(counter); -} - -static void cpu_clock_perf_counter_read(struct perf_counter *counter) -{ - cpu_clock_perf_counter_update(counter); -} - -static const struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_counter_enable, - .disable = cpu_clock_perf_counter_disable, - .read = cpu_clock_perf_counter_read, -}; - -/* - * Software counter: task time clock - */ - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; - s64 delta; - - prev = atomic64_xchg(&counter->hw.prev_count, now); - delta = now - prev; - atomic64_add(delta, &counter->count); -} - -static int task_clock_perf_counter_enable(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - u64 now; - - now = counter->ctx->time; - - atomic64_set(&hwc->prev_count, now); - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; - if (hwc->sample_period) { - u64 period = max_t(u64, 10000, hwc->sample_period); - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } - - return 0; -} - -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ - if (counter->hw.sample_period) - hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, counter->ctx->time); - -} - -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(counter->ctx); - time = counter->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - counter->ctx->timestamp; - time = counter->ctx->time + delta; - } - - task_clock_perf_counter_update(counter, time); -} - -static const struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_counter_enable, - .disable = task_clock_perf_counter_disable, - .read = task_clock_perf_counter_read, -}; - -#ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) -{ - struct perf_raw_record raw = { - .size = entry_size, - .data = record, - }; - - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); - - if (!regs) - regs = task_pt_regs(current); - - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); -} -EXPORT_SYMBOL_GPL(perf_tpcounter_event); - -extern int ftrace_profile_enable(int); -extern void ftrace_profile_disable(int); - -static void tp_perf_counter_destroy(struct perf_counter *counter) -{ - ftrace_profile_disable(counter->attr.config); -} - -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - /* - * Raw tracepoint data is a severe data leak, only allow root to - * have these. - */ - if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && - perf_paranoid_tracepoint_raw() && - !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); - - if (ftrace_profile_enable(counter->attr.config)) - return NULL; - - counter->destroy = tp_perf_counter_destroy; - - return &perf_ops_generic; -} -#else -static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) -{ - return NULL; -} -#endif - -atomic_t perf_swcounter_enabled[PERF_COUNT_SW_MAX]; - -static void sw_perf_counter_destroy(struct perf_counter *counter) -{ - u64 event_id = counter->attr.config; - - WARN_ON(counter->parent); - - atomic_dec(&perf_swcounter_enabled[event_id]); -} - -static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) -{ - const struct pmu *pmu = NULL; - u64 event_id = counter->attr.config; - - /* - * Software counters (currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu counter, - * use the cpu_clock counter instead. - */ - if (counter->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; - - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - if (!counter->parent) { - atomic_inc(&perf_swcounter_enabled[event_id]); - counter->destroy = sw_perf_counter_destroy; - } - pmu = &perf_ops_generic; - break; - } - - return pmu; -} - -/* - * Allocate and initialize a counter structure - */ -static struct perf_counter * -perf_counter_alloc(struct perf_counter_attr *attr, - int cpu, - struct perf_counter_context *ctx, - struct perf_counter *group_leader, - struct perf_counter *parent_counter, - gfp_t gfpflags) -{ - const struct pmu *pmu; - struct perf_counter *counter; - struct hw_perf_counter *hwc; - long err; - - counter = kzalloc(sizeof(*counter), gfpflags); - if (!counter) - return ERR_PTR(-ENOMEM); - - /* - * Single counters are their own group leaders, with an - * empty sibling list: - */ - if (!group_leader) - group_leader = counter; - - mutex_init(&counter->child_mutex); - INIT_LIST_HEAD(&counter->child_list); - - INIT_LIST_HEAD(&counter->group_entry); - INIT_LIST_HEAD(&counter->event_entry); - INIT_LIST_HEAD(&counter->sibling_list); - init_waitqueue_head(&counter->waitq); - - mutex_init(&counter->mmap_mutex); - - counter->cpu = cpu; - counter->attr = *attr; - counter->group_leader = group_leader; - counter->pmu = NULL; - counter->ctx = ctx; - counter->oncpu = -1; - - counter->parent = parent_counter; - - counter->ns = get_pid_ns(current->nsproxy->pid_ns); - counter->id = atomic64_inc_return(&perf_counter_id); - - counter->state = PERF_COUNTER_STATE_INACTIVE; - - if (attr->disabled) - counter->state = PERF_COUNTER_STATE_OFF; - - pmu = NULL; - - hwc = &counter->hw; - hwc->sample_period = attr->sample_period; - if (attr->freq && attr->sample_freq) - hwc->sample_period = 1; - hwc->last_period = hwc->sample_period; - - atomic64_set(&hwc->period_left, hwc->sample_period); - - /* - * we currently do not support PERF_FORMAT_GROUP on inherited counters - */ - if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) - goto done; - - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_counter_init(counter); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_counter_init(counter); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_counter_init(counter); - break; - - default: - break; - } -done: - err = 0; - if (!pmu) - err = -EINVAL; - else if (IS_ERR(pmu)) - err = PTR_ERR(pmu); - - if (err) { - if (counter->ns) - put_pid_ns(counter->ns); - kfree(counter); - return ERR_PTR(err); - } - - counter->pmu = pmu; - - if (!counter->parent) { - atomic_inc(&nr_counters); - if (counter->attr.mmap) - atomic_inc(&nr_mmap_counters); - if (counter->attr.comm) - atomic_inc(&nr_comm_counters); - if (counter->attr.task) - atomic_inc(&nr_task_counters); - } - - return counter; -} - -static int perf_copy_attr(struct perf_counter_attr __user *uattr, - struct perf_counter_attr *attr) -{ - u32 size; - int ret; - - if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) - return -EFAULT; - - /* - * zero the full structure, so that a short copy will be nice. - */ - memset(attr, 0, sizeof(*attr)); - - ret = get_user(size, &uattr->size); - if (ret) - return ret; - - if (size > PAGE_SIZE) /* silly large */ - goto err_size; - - if (!size) /* abi compat */ - size = PERF_ATTR_SIZE_VER0; - - if (size < PERF_ATTR_SIZE_VER0) - goto err_size; - - /* - * If we're handed a bigger struct than we know of, - * ensure all the unknown bits are 0 - i.e. new - * user-space does not rely on any kernel feature - * extensions we dont know about yet. - */ - if (size > sizeof(*attr)) { - unsigned char __user *addr; - unsigned char __user *end; - unsigned char val; - - addr = (void __user *)uattr + sizeof(*attr); - end = (void __user *)uattr + size; - - for (; addr < end; addr++) { - ret = get_user(val, addr); - if (ret) - return ret; - if (val) - goto err_size; - } - size = sizeof(*attr); - } - - ret = copy_from_user(attr, uattr, size); - if (ret) - return -EFAULT; - - /* - * If the type exists, the corresponding creation will verify - * the attr->config. - */ - if (attr->type >= PERF_TYPE_MAX) - return -EINVAL; - - if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) - return -EINVAL; - - if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) - return -EINVAL; - - if (attr->read_format & ~(PERF_FORMAT_MAX-1)) - return -EINVAL; - -out: - return ret; - -err_size: - put_user(sizeof(*attr), &uattr->size); - ret = -E2BIG; - goto out; -} - -int perf_counter_set_output(struct perf_counter *counter, int output_fd) -{ - struct perf_counter *output_counter = NULL; - struct file *output_file = NULL; - struct perf_counter *old_output; - int fput_needed = 0; - int ret = -EINVAL; - - if (!output_fd) - goto set; - - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) - goto out; - - output_counter = output_file->private_data; - - /* Don't chain output fds */ - if (output_counter->output) - goto out; - - /* Don't set an output fd when we already have an output channel */ - if (counter->data) - goto out; - - atomic_long_inc(&output_file->f_count); - -set: - mutex_lock(&counter->mmap_mutex); - old_output = counter->output; - rcu_assign_pointer(counter->output, output_counter); - mutex_unlock(&counter->mmap_mutex); - - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this counter. - */ - synchronize_rcu(); - fput(old_output->filp); - } - - ret = 0; -out: - fput_light(output_file, fput_needed); - return ret; -} - -/** - * sys_perf_counter_open - open a performance counter, associate it to a task/cpu - * - * @attr_uptr: event type attributes for monitoring/sampling - * @pid: target pid - * @cpu: target cpu - * @group_fd: group leader counter fd - */ -SYSCALL_DEFINE5(perf_counter_open, - struct perf_counter_attr __user *, attr_uptr, - pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) -{ - struct perf_counter *counter, *group_leader; - struct perf_counter_attr attr; - struct perf_counter_context *ctx; - struct file *counter_file = NULL; - struct file *group_file = NULL; - int fput_needed = 0; - int fput_needed2 = 0; - int err; - - /* for future expandability... */ - if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) - return -EINVAL; - - err = perf_copy_attr(attr_uptr, &attr); - if (err) - return err; - - if (!attr.exclude_kernel) { - if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) - return -EACCES; - } - - if (attr.freq) { - if (attr.sample_freq > sysctl_perf_counter_sample_rate) - return -EINVAL; - } - - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - /* - * Look up the group leader (we will attach this counter to it): - */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { - err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - - group_leader = group_file->private_data; - /* - * Do not allow a recursive hierarchy (this new sibling - * becoming part of another group-sibling): - */ - if (group_leader->group_leader != group_leader) - goto err_put_context; - /* - * Do not allow to attach to a group in a different - * task or CPU context: - */ - if (group_leader->ctx != ctx) - goto err_put_context; - /* - * Only a group leader can be exclusive or pinned - */ - if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - counter = perf_counter_alloc(&attr, cpu, ctx, group_leader, - NULL, GFP_KERNEL); - err = PTR_ERR(counter); - if (IS_ERR(counter)) - goto err_put_context; - - err = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); - if (err < 0) - goto err_free_put_context; - - counter_file = fget_light(err, &fput_needed2); - if (!counter_file) - goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_counter_set_output(counter, group_fd); - if (err) - goto err_fput_free_put_context; - } - - counter->filp = counter_file; - WARN_ON_ONCE(ctx->parent_ctx); - mutex_lock(&ctx->mutex); - perf_install_in_context(ctx, counter, cpu); - ++ctx->generation; - mutex_unlock(&ctx->mutex); - - counter->owner = current; - get_task_struct(current); - mutex_lock(¤t->perf_counter_mutex); - list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); - mutex_unlock(¤t->perf_counter_mutex); - -err_fput_free_put_context: - fput_light(counter_file, fput_needed2); - -err_free_put_context: - if (err < 0) - kfree(counter); - -err_put_context: - if (err < 0) - put_ctx(ctx); - - fput_light(group_file, fput_needed); - - return err; -} - -/* - * inherit a counter from parent task to child task: - */ -static struct perf_counter * -inherit_counter(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter *group_leader, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *child_counter; - - /* - * Instead of creating recursive hierarchies of counters, - * we link inherited counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_counter->parent) - parent_counter = parent_counter->parent; - - child_counter = perf_counter_alloc(&parent_counter->attr, - parent_counter->cpu, child_ctx, - group_leader, parent_counter, - GFP_KERNEL); - if (IS_ERR(child_counter)) - return child_counter; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent counter, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en, dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - - if (parent_counter->attr.freq) - child_counter->hw.sample_period = parent_counter->hw.sample_period; - - /* - * Link it up in the child's context: - */ - add_counter_to_ctx(child_counter, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child counter exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_counter->filp->f_count); - - /* - * Link this into the parent counter's child list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_add_tail(&child_counter->child_list, &parent_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - return child_counter; -} - -static int inherit_group(struct perf_counter *parent_counter, - struct task_struct *parent, - struct perf_counter_context *parent_ctx, - struct task_struct *child, - struct perf_counter_context *child_ctx) -{ - struct perf_counter *leader; - struct perf_counter *sub; - struct perf_counter *child_ctr; - - leader = inherit_counter(parent_counter, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_counter->sibling_list, group_entry) { - child_ctr = inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - -static void sync_child_counter(struct perf_counter *child_counter, - struct task_struct *child) -{ - struct perf_counter *parent_counter = child_counter->parent; - u64 child_val; - - if (child_counter->attr.inherit_stat) - perf_counter_read_event(child_counter, child); - - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - atomic64_add(child_counter->total_time_enabled, - &parent_counter->child_total_time_enabled); - atomic64_add(child_counter->total_time_running, - &parent_counter->child_total_time_running); - - /* - * Remove this counter from the parent's list - */ - WARN_ON_ONCE(parent_counter->ctx->parent_ctx); - mutex_lock(&parent_counter->child_mutex); - list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->child_mutex); - - /* - * Release the parent counter, if this was the last - * reference to it. - */ - fput(parent_counter->filp); -} - -static void -__perf_counter_exit_task(struct perf_counter *child_counter, - struct perf_counter_context *child_ctx, - struct task_struct *child) -{ - struct perf_counter *parent_counter; - - update_counter_times(child_counter); - perf_counter_remove_from_context(child_counter); - - parent_counter = child_counter->parent; - /* - * It can happen that parent exits first, and has counters - * that are still around due to the child reference. These - * counters need to be zapped - but otherwise linger. - */ - if (parent_counter) { - sync_child_counter(child_counter, child); - free_counter(child_counter); - } -} - -/* - * When a child task exits, feed back counter values to parent counters. - */ -void perf_counter_exit_task(struct task_struct *child) -{ - struct perf_counter *child_counter, *tmp; - struct perf_counter_context *child_ctx; - unsigned long flags; - - if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, NULL, 0); - return; - } - - local_irq_save(flags); - /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. - */ - child_ctx = child->perf_counter_ctxp; - __perf_counter_task_sched_out(child_ctx); - - /* - * Take the context lock here so that if find_get_context is - * reading child->perf_counter_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. - */ - spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; - /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the counters from it. - */ - unclone_ctx(child_ctx); - spin_unlock_irqrestore(&child_ctx->lock, flags); - - /* - * Report the task dead after unscheduling the counters so that we - * won't get any samples after PERF_EVENT_EXIT. We can however still - * get a few PERF_EVENT_READ events. - */ - perf_counter_task(child, child_ctx, 0); - - /* - * We can recurse on the same lock type through: - * - * __perf_counter_exit_task() - * sync_child_counter() - * fput(parent_counter->filp) - * perf_release() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); - -again: - list_for_each_entry_safe(child_counter, tmp, &child_ctx->group_list, - group_entry) - __perf_counter_exit_task(child_counter, child_ctx, child); - - /* - * If the last counter was a group counter, it will have appended all - * its siblings to the list, but we obtained 'tmp' before that which - * will still point to the list head terminating the iteration. - */ - if (!list_empty(&child_ctx->group_list)) - goto again; - - mutex_unlock(&child_ctx->mutex); - - put_ctx(child_ctx); -} - -/* - * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. - */ -void perf_counter_free_task(struct task_struct *task) -{ - struct perf_counter_context *ctx = task->perf_counter_ctxp; - struct perf_counter *counter, *tmp; - - if (!ctx) - return; - - mutex_lock(&ctx->mutex); -again: - list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) { - struct perf_counter *parent = counter->parent; - - if (WARN_ON_ONCE(!parent)) - continue; - - mutex_lock(&parent->child_mutex); - list_del_init(&counter->child_list); - mutex_unlock(&parent->child_mutex); - - fput(parent->filp); - - list_del_counter(counter, ctx); - free_counter(counter); - } - - if (!list_empty(&ctx->group_list)) - goto again; - - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - -/* - * Initialize the perf_counter context in task_struct - */ -int perf_counter_init_task(struct task_struct *child) -{ - struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter_context *cloned_ctx; - struct perf_counter *counter; - struct task_struct *parent = current; - int inherited_all = 1; - int ret = 0; - - child->perf_counter_ctxp = NULL; - - mutex_init(&child->perf_counter_mutex); - INIT_LIST_HEAD(&child->perf_counter_list); - - if (likely(!parent->perf_counter_ctxp)) - return 0; - - /* - * This is executed from the parent task context, so inherit - * counters that have been marked for cloning. - * First allocate and initialize a context for the child. - */ - - child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_counter_init_context(child_ctx, child); - child->perf_counter_ctxp = child_ctx; - get_task_struct(child); - - /* - * If the parent's context is a clone, pin it so it won't get - * swapped under us. - */ - parent_ctx = perf_pin_task_context(parent); - - /* - * No need to check if parent_ctx != NULL here; since we saw - * it non-NULL earlier, the only reason for it to become NULL - * is if we exit, and since we're currently in the middle of - * a fork we can't be exiting at the same time. - */ - - /* - * Lock the parent list. No need to lock the child - not PID - * hashed yet and not running, so nobody can access it. - */ - mutex_lock(&parent_ctx->mutex); - - /* - * We dont have to disable NMIs - we are only looking at - * the list, not manipulating it: - */ - list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { - if (counter != counter->group_leader) - continue; - - if (!counter->attr.inherit) { - inherited_all = 0; - continue; - } - - ret = inherit_group(counter, parent, parent_ctx, - child, child_ctx); - if (ret) { - inherited_all = 0; - break; - } - } - - if (inherited_all) { - /* - * Mark the child context as a clone of the parent - * context, or of whatever the parent is a clone of. - * Note that if the parent is a clone, it could get - * uncloned at any point, but that doesn't matter - * because the list of counters and the generation - * count can't have changed since we took the mutex. - */ - cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); - if (cloned_ctx) { - child_ctx->parent_ctx = cloned_ctx; - child_ctx->parent_gen = parent_ctx->parent_gen; - } else { - child_ctx->parent_ctx = parent_ctx; - child_ctx->parent_gen = parent_ctx->generation; - } - get_ctx(child_ctx->parent_ctx); - } - - mutex_unlock(&parent_ctx->mutex); - - perf_unpin_context(parent_ctx); - - return ret; -} - -static void __cpuinit perf_counter_init_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_counter_init_context(&cpuctx->ctx, NULL); - - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - - hw_perf_counter_setup(cpu); -} - -#ifdef CONFIG_HOTPLUG_CPU -static void __perf_counter_exit_cpu(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter_context *ctx = &cpuctx->ctx; - struct perf_counter *counter, *tmp; - - list_for_each_entry_safe(counter, tmp, &ctx->group_list, group_entry) - __perf_counter_remove_from_context(counter); -} -static void perf_counter_exit_cpu(int cpu) -{ - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &cpuctx->ctx; - - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); -} -#else -static inline void perf_counter_exit_cpu(int cpu) { } -#endif - -static int __cpuinit -perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -{ - unsigned int cpu = (long)hcpu; - - switch (action) { - - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - perf_counter_init_cpu(cpu); - break; - - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_counter_setup_online(cpu); - break; - - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - perf_counter_exit_cpu(cpu); - break; - - default: - break; - } - - return NOTIFY_OK; -} - -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - -void __init perf_counter_init(void) -{ - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); -} - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_counters) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, - perf_max_counters - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_counters", -}; - -static int __init perf_counter_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_counter_sysfs_init); diff --git a/kernel/perf_event.c b/kernel/perf_event.c new file mode 100644 index 000000000000..6e8b99a04e1e --- /dev/null +++ b/kernel/perf_event.c @@ -0,0 +1,5000 @@ +/* + * Performance event core code + * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. + * + * For licensing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Each CPU has a list of per CPU events: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_events __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_events __read_mostly; +static atomic_t nr_mmap_events __read_mostly; +static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_task_events __read_mostly; + +/* + * perf event paranoia level: + * -1 - not paranoid at all + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv + */ +int sysctl_perf_event_paranoid __read_mostly = 1; + +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + +int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf event sample rate + */ +int sysctl_perf_event_sample_rate __read_mostly = 100000; + +static atomic64_t perf_event_id; + +/* + * Lock for (sysadmin-configurable) event reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +{ + return NULL; +} + +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + +void __weak hw_perf_event_setup(int cpu) { barrier(); } +void __weak hw_perf_event_setup_online(int cpu) { barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_event *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, int cpu) +{ + return 0; +} + +void __weak perf_event_print_debug(void) { } + +static DEFINE_PER_CPU(int, perf_disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(perf_disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(perf_disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + +static void get_ctx(struct perf_event_context *ctx) +{ + WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); +} + +static void free_ctx(struct rcu_head *head) +{ + struct perf_event_context *ctx; + + ctx = container_of(head, struct perf_event_context, rcu_head); + kfree(ctx); +} + +static void put_ctx(struct perf_event_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); + if (ctx->task) + put_task_struct(ctx->task); + call_rcu(&ctx->rcu_head, free_ctx); + } +} + +static void unclone_ctx(struct perf_event_context *ctx) +{ + if (ctx->parent_ctx) { + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; + } +} + +/* + * If we inherit events we want to return the parent event id + * to userspace. + */ +static u64 primary_event_id(struct perf_event *event) +{ + u64 id = event->id; + + if (event->parent) + id = event->parent->id; + + return id; +} + +/* + * Get the perf_event_context for a task and lock it. + * This has to cope with with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_event_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ + struct perf_event_context *ctx; + + rcu_read_lock(); + retry: + ctx = rcu_dereference(task->perf_event_ctxp); + if (ctx) { + /* + * If this context is a clone of another, it might + * get swapped for another underneath us by + * perf_event_task_sched_out, though the + * rcu_read_lock() protects us from any context + * getting freed. Lock the context and check if it + * got swapped before we could get the lock, and retry + * if so. If we locked the right context, then it + * can't get swapped on us any more. + */ + spin_lock_irqsave(&ctx->lock, *flags); + if (ctx != rcu_dereference(task->perf_event_ctxp)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + goto retry; + } + + if (!atomic_inc_not_zero(&ctx->refcount)) { + spin_unlock_irqrestore(&ctx->lock, *flags); + ctx = NULL; + } + } + rcu_read_unlock(); + return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task. This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +{ + struct perf_event_context *ctx; + unsigned long flags; + + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + ++ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + } + return ctx; +} + +static void perf_unpin_context(struct perf_event_context *ctx) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->lock, flags); + --ctx->pin_count; + spin_unlock_irqrestore(&ctx->lock, flags); + put_ctx(ctx); +} + +/* + * Add a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_add_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *group_leader = event->group_leader; + + /* + * Depending on whether it is a standalone or sibling event, + * add it straight to the context's event list, or to the group + * leader's sibling list: + */ + if (group_leader == event) + list_add_tail(&event->group_entry, &ctx->group_list); + else { + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } + + list_add_rcu(&event->event_entry, &ctx->event_list); + ctx->nr_events++; + if (event->attr.inherit_stat) + ctx->nr_stat++; +} + +/* + * Remove a event from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_event(struct perf_event *event, struct perf_event_context *ctx) +{ + struct perf_event *sibling, *tmp; + + if (list_empty(&event->group_entry)) + return; + ctx->nr_events--; + if (event->attr.inherit_stat) + ctx->nr_stat--; + + list_del_init(&event->group_entry); + list_del_rcu(&event->event_entry); + + if (event->group_leader != event) + event->group_leader->nr_siblings--; + + /* + * If this was a group event with sibling events then + * upgrade the siblings to singleton events by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { + + list_move_tail(&sibling->group_entry, &ctx->group_list); + sibling->group_leader = sibling; + } +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event->state = PERF_EVENT_STATE_INACTIVE; + if (event->pending_disable) { + event->pending_disable = 0; + event->state = PERF_EVENT_STATE_OFF; + } + event->tstamp_stopped = ctx->time; + event->pmu->disable(event); + event->oncpu = -1; + + if (!is_software_event(event)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (event->attr.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (group_event->state != PERF_EVENT_STATE_ACTIVE) + return; + + event_sched_out(group_event, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) + event_sched_out(event, cpuctx, ctx); + + if (group_event->attr.exclusive) + cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance event + * + * We disable the event on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_event_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + /* + * Protect the list operation against NMI by disabling the + * events on a global level. + */ + perf_disable(); + + event_sched_out(event, cpuctx, ctx); + + list_del_event(event, ctx); + + if (!ctx->task) { + /* + * Allow more per task events with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_events - ctx->nr_events, + perf_max_events - perf_reserved_percpu); + } + + perf_enable(); + spin_unlock(&ctx->lock); +} + + +/* + * Remove the event from a task's (or a CPU's) list of events. + * + * Must be called with ctx->mutex held. + * + * CPU events are removed with a smp call. For task events we only + * call when the task is on a CPU. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_event_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_event_remove_from_context(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(event->cpu, + __perf_event_remove_from_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_event_remove_from_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the event safely, if the call above did not + * succeed. + */ + if (!list_empty(&event->group_entry)) { + list_del_event(event, ctx); + } + spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_event_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a event. + */ +static void update_event_times(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + u64 run_end; + + if (event->state < PERF_EVENT_STATE_INACTIVE || + event->group_leader->state < PERF_EVENT_STATE_INACTIVE) + return; + + event->total_time_enabled = ctx->time - event->tstamp_enabled; + + if (event->state == PERF_EVENT_STATE_INACTIVE) + run_end = event->tstamp_stopped; + else + run_end = ctx->time; + + event->total_time_running = run_end - event->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all events in a group. + */ +static void update_group_times(struct perf_event *leader) +{ + struct perf_event *event; + + update_event_times(leader); + list_for_each_entry(event, &leader->sibling_list, group_entry) + update_event_times(event); +} + +/* + * Cross CPU call to disable a performance event + */ +static void __perf_event_disable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * If the event is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (event->state >= PERF_EVENT_STATE_INACTIVE) { + update_context_time(ctx); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock(&ctx->lock); +} + +/* + * Disable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisifed when called through + * perf_event_for_each_child or perf_event_for_each because they + * hold the top-level event's child_mutex, so any descendant that + * goes to exit will block in sync_child_event. + * When called from perf_pending_event it's OK because event->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_event_task_sched_out for this context. + */ +static void perf_event_disable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_disable, + event, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_event_disable, event); + + spin_lock_irq(&ctx->lock); + /* + * If the event is still active, we need to retry the cross-call. + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_OFF; + } + + spin_unlock_irq(&ctx->lock); +} + +static int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + if (event->state <= PERF_EVENT_STATE_OFF) + return 0; + + event->state = PERF_EVENT_STATE_ACTIVE; + event->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (event->pmu->enable(event)) { + event->state = PERF_EVENT_STATE_INACTIVE; + event->oncpu = -1; + return -EAGAIN; + } + + event->tstamp_running += ctx->time - event->tstamp_stopped; + + if (!is_software_event(event)) + cpuctx->active_oncpu++; + ctx->nr_active++; + + if (event->attr.exclusive) + cpuctx->exclusive = 1; + + return 0; +} + +static int +group_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + int cpu) +{ + struct perf_event *event, *partial_group; + int ret; + + if (group_event->state == PERF_EVENT_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_event, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + if (event_sched_in(group_event, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event_sched_in(event, cpuctx, ctx, cpu)) { + partial_group = event; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + if (event == partial_group) + break; + event_sched_out(event, cpuctx, ctx); + } + event_sched_out(group_event, cpuctx, ctx); + + return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software events, + * 0 if the group contains any hardware events. + */ +static int is_software_only_group(struct perf_event *leader) +{ + struct perf_event *event; + + if (!is_software_event(leader)) + return 0; + + list_for_each_entry(event, &leader->sibling_list, group_entry) + if (!is_software_event(event)) + return 0; + + return 1; +} + +/* + * Work out whether we can put this event group on the CPU now. + */ +static int group_can_go_on(struct perf_event *event, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software events can always go on. + */ + if (is_software_only_group(event)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * events can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * events on the CPU, it can't go on. + */ + if (event->attr.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + +static void add_event_to_ctx(struct perf_event *event, + struct perf_event_context *ctx) +{ + list_add_event(event, ctx); + event->tstamp_enabled = ctx->time; + event->tstamp_running = ctx->time; + event->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance event + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int cpu = smp_processor_id(); + int err; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no events. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + /* + * Protect the list operation against NMI by disabling the + * events on a global level. NOP for non NMI based events. + */ + perf_disable(); + + add_event_to_ctx(event, ctx); + + /* + * Don't put the event on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE || + (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)) + goto unlock; + + /* + * An exclusive event can't go on if there are already active + * hardware events, and no hardware event can go on if there + * is already an exclusive event on. + */ + if (!group_can_go_on(event, cpuctx, 1)) + err = -EEXIST; + else + err = event_sched_in(event, cpuctx, ctx, cpu); + + if (err) { + /* + * This event couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the event group is pinned then put it in error state. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + if (!err && !ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + unlock: + perf_enable(); + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance event to a context + * + * First we add the event to the list with the hardware enable bit + * in event->hw_config cleared. + * + * If the event is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_event_context *ctx, + struct perf_event *event, + int cpu) +{ + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu events are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + event, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_install_in_context, + event); + + spin_lock_irq(&ctx->lock); + /* + * we need to retry the smp call. + */ + if (ctx->is_active && list_empty(&event->group_entry)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the event safely, if it the call above did not + * succeed. + */ + if (list_empty(&event->group_entry)) + add_event_to_ctx(event, ctx); + spin_unlock_irq(&ctx->lock); +} + +/* + * Put a event into inactive state and update time fields. + * Enabling the leader of a group effectively enables all + * the group members that aren't explicitly disabled, so we + * have to update their ->tstamp_enabled also. + * Note: this works for group members as well as group leaders + * since the non-leader members' sibling_lists will be empty. + */ +static void __perf_event_mark_enabled(struct perf_event *event, + struct perf_event_context *ctx) +{ + struct perf_event *sub; + + event->state = PERF_EVENT_STATE_INACTIVE; + event->tstamp_enabled = ctx->time - event->total_time_enabled; + list_for_each_entry(sub, &event->sibling_list, group_entry) + if (sub->state >= PERF_EVENT_STATE_INACTIVE) + sub->tstamp_enabled = + ctx->time - sub->total_time_enabled; +} + +/* + * Cross CPU call to enable a performance event + */ +static void __perf_event_enable(void *info) +{ + struct perf_event *event = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = event->ctx; + struct perf_event *leader = event->group_leader; + int err; + + /* + * If this is a per-task event, need to check whether this + * event's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } + + spin_lock(&ctx->lock); + ctx->is_active = 1; + update_context_time(ctx); + + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto unlock; + __perf_event_mark_enabled(event, ctx); + + /* + * If the event is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(event, cpuctx, 1)) { + err = -EEXIST; + } else { + perf_disable(); + if (event == leader) + err = group_sched_in(event, cpuctx, ctx, + smp_processor_id()); + else + err = event_sched_in(event, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } + + if (err) { + /* + * If this event can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != event) + group_sched_out(leader, cpuctx, ctx); + if (leader->attr.pinned) { + update_group_times(leader); + leader->state = PERF_EVENT_STATE_ERROR; + } + } + + unlock: + spin_unlock(&ctx->lock); +} + +/* + * Enable a event. + * + * If event->ctx is a cloned context, callers must make sure that + * every task struct that event->ctx->task could possibly point to + * remains valid. This condition is satisfied when called through + * perf_event_for_each_child or perf_event_for_each as described + * for perf_event_disable. + */ +static void perf_event_enable(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the event on the cpu that it's on + */ + smp_call_function_single(event->cpu, __perf_event_enable, + event, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE) + goto out; + + /* + * If the event is in error state, clear that first. + * That way, if we see the event in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (event->state == PERF_EVENT_STATE_ERROR) + event->state = PERF_EVENT_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_event_enable, event); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the event is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (event->state == PERF_EVENT_STATE_OFF) + __perf_event_mark_enabled(event, ctx); + + out: + spin_unlock_irq(&ctx->lock); +} + +static int perf_event_refresh(struct perf_event *event, int refresh) +{ + /* + * not supported on inherited events + */ + if (event->attr.inherit) + return -EINVAL; + + atomic_add(refresh, &event->event_limit); + perf_event_enable(event); + + return 0; +} + +void __perf_event_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_event *event; + + spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_events)) + goto out; + update_context_time(ctx); + + perf_disable(); + if (ctx->nr_active) { + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event != event->group_leader) + event_sched_out(event, cpuctx, ctx); + else + group_sched_out(event, cpuctx, ctx); + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled events. + * If the number of enabled events is the same, then the set + * of enabled events should be the same, because these are both + * inherited contexts, therefore we can't access individual events + * in them directly with an fd; we can only enable/disable all + * events via prctl, or enable/disable all events in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_event_context *ctx1, + struct perf_event_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && !ctx1->pin_count && !ctx2->pin_count; +} + +static void __perf_event_read(void *event); + +static void __perf_event_sync_stat(struct perf_event *event, + struct perf_event *next_event) +{ + u64 value; + + if (!event->attr.inherit_stat) + return; + + /* + * Update the event value, we cannot use perf_event_read() + * because we're in the middle of a context switch and have IRQs + * disabled, which upsets smp_call_function_single(), however + * we know the event must be on the current CPU, therefore we + * don't need to use it. + */ + switch (event->state) { + case PERF_EVENT_STATE_ACTIVE: + __perf_event_read(event); + break; + + case PERF_EVENT_STATE_INACTIVE: + update_event_times(event); + break; + + default: + break; + } + + /* + * In order to keep per-task stats reliable we need to flip the event + * values when we flip the contexts. + */ + value = atomic64_read(&next_event->count); + value = atomic64_xchg(&event->count, value); + atomic64_set(&next_event->count, value); + + swap(event->total_time_enabled, next_event->total_time_enabled); + swap(event->total_time_running, next_event->total_time_running); + + /* + * Since we swizzled the values, update the user visible data too. + */ + perf_event_update_userpage(event); + perf_event_update_userpage(next_event); +} + +#define list_next_entry(pos, member) \ + list_entry(pos->member.next, typeof(*pos), member) + +static void perf_event_sync_stat(struct perf_event_context *ctx, + struct perf_event_context *next_ctx) +{ + struct perf_event *event, *next_event; + + if (!ctx->nr_stat) + return; + + event = list_first_entry(&ctx->event_list, + struct perf_event, event_entry); + + next_event = list_first_entry(&next_ctx->event_list, + struct perf_event, event_entry); + + while (&event->event_entry != &ctx->event_list && + &next_event->event_entry != &next_ctx->event_list) { + + __perf_event_sync_stat(event, next_event); + + event = list_next_entry(event, event_entry); + next_event = list_next_entry(next_event, event_entry); + } +} + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *next_ctx; + struct perf_event_context *parent; + struct pt_regs *regs; + int do_switch = 1; + + regs = task_pt_regs(task); + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + + if (likely(!ctx || !cpuctx->task_ctx)) + return; + + update_context_time(ctx); + + rcu_read_lock(); + parent = rcu_dereference(ctx->parent_ctx); + next_ctx = next->perf_event_ctxp; + if (parent && next_ctx && + rcu_dereference(next_ctx->parent_ctx) == parent) { + /* + * Looks like the two contexts are clones, so we might be + * able to optimize the context switch. We lock both + * contexts and check that they are clones under the + * lock (including re-checking that neither has been + * uncloned in the meantime). It doesn't matter which + * order we take the locks because no other cpu could + * be trying to lock both of these tasks. + */ + spin_lock(&ctx->lock); + spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); + if (context_equiv(ctx, next_ctx)) { + /* + * XXX do we need a memory barrier of sorts + * wrt to rcu_dereference() of perf_event_ctxp + */ + task->perf_event_ctxp = next_ctx; + next->perf_event_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + do_switch = 0; + + perf_event_sync_stat(ctx, next_ctx); + } + spin_unlock(&next_ctx->lock); + spin_unlock(&ctx->lock); + } + rcu_read_unlock(); + + if (do_switch) { + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; + } +} + +/* + * Called with IRQs disabled + */ +static void __perf_event_task_sched_out(struct perf_event_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + __perf_event_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_event_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ + __perf_event_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_event_sched_in(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event *event; + int can_add_hw = 1; + + spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_events)) + goto out; + + ctx->timestamp = perf_clock(); + + perf_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state <= PERF_EVENT_STATE_OFF || + !event->attr.pinned) + continue; + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) + event_sched_in(event, cpuctx, ctx, cpu); + else { + if (group_can_go_on(event, cpuctx, 1)) + group_sched_in(event, cpuctx, ctx, cpu); + } + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_group_times(event); + event->state = PERF_EVENT_STATE_ERROR; + } + } + + list_for_each_entry(event, &ctx->group_list, group_entry) { + /* + * Ignore events in OFF or ERROR state, and + * ignore pinned events since we did them already. + */ + if (event->state <= PERF_EVENT_STATE_OFF || + event->attr.pinned) + continue; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of events: + */ + if (event->cpu != -1 && event->cpu != cpu) + continue; + + if (event != event->group_leader) { + if (event_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } else { + if (group_can_go_on(event, cpuctx, can_add_hw)) { + if (group_sched_in(event, cpuctx, ctx, cpu)) + can_add_hw = 0; + } + } + } + perf_enable(); + out: + spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = task->perf_event_ctxp; + + if (likely(!ctx)) + return; + if (cpuctx->task_ctx == ctx) + return; + __perf_event_sched_in(ctx, cpuctx, cpu); + cpuctx->task_ctx = ctx; +} + +static void perf_event_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_event_context *ctx = &cpuctx->ctx; + + __perf_event_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_event *event, int enable); + +static void perf_adjust_period(struct perf_event *event, u64 events) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period, sample_period; + s64 delta; + + events *= hwc->sample_period; + period = div64_u64(events, event->attr.sample_freq); + + delta = (s64)(period - hwc->sample_period); + delta = (delta + 7) / 8; /* low pass filter */ + + sample_period = hwc->sample_period + delta; + + if (!sample_period) + sample_period = 1; + + hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +{ + struct perf_event *event; + struct hw_perf_event *hwc; + u64 interrupts, freq; + + spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (event->state != PERF_EVENT_STATE_ACTIVE) + continue; + + hwc = &event->hw; + + interrupts = hwc->interrupts; + hwc->interrupts = 0; + + /* + * unthrottle events on the tick + */ + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(event, 1); + event->pmu->unthrottle(event); + interrupts = 2*sysctl_perf_event_sample_rate/HZ; + } + + if (!event->attr.freq || !event->attr.sample_freq) + continue; + + /* + * if the specified freq < HZ then we need to skip ticks + */ + if (event->attr.sample_freq < HZ) { + freq = event->attr.sample_freq; + + hwc->freq_count += freq; + hwc->freq_interrupts += interrupts; + + if (hwc->freq_count < HZ) + continue; + + interrupts = hwc->freq_interrupts; + hwc->freq_interrupts = 0; + hwc->freq_count -= HZ; + } else + freq = HZ; + + perf_adjust_period(event, freq * interrupts); + + /* + * In order to avoid being stalled by an (accidental) huge + * sample period, force reset the sample period if we didn't + * get any events in this freq period. + */ + if (!interrupts) { + perf_disable(); + event->pmu->disable(event); + atomic64_set(&hwc->period_left, 0); + event->pmu->enable(event); + perf_enable(); + } + } + spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's events: + */ +static void rotate_ctx(struct perf_event_context *ctx) +{ + struct perf_event *event; + + if (!ctx->nr_events) + return; + + spin_lock(&ctx->lock); + /* + * Rotate the first entry last (works just fine for group events too): + */ + perf_disable(); + list_for_each_entry(event, &ctx->group_list, group_entry) { + list_move_tail(&event->group_entry, &ctx->group_list); + break; + } + perf_enable(); + + spin_unlock(&ctx->lock); +} + +void perf_event_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + + if (!atomic_read(&nr_events)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = curr->perf_event_ctxp; + + perf_ctx_adjust_freq(&cpuctx->ctx); + if (ctx) + perf_ctx_adjust_freq(ctx); + + perf_event_cpu_sched_out(cpuctx); + if (ctx) + __perf_event_task_sched_out(ctx); + + rotate_ctx(&cpuctx->ctx); + if (ctx) + rotate_ctx(ctx); + + perf_event_cpu_sched_in(cpuctx, cpu); + if (ctx) + perf_event_task_sched_in(curr, cpu); +} + +/* + * Enable all of a task's events that have been marked enable-on-exec. + * This expects task == current. + */ +static void perf_event_enable_on_exec(struct task_struct *task) +{ + struct perf_event_context *ctx; + struct perf_event *event; + unsigned long flags; + int enabled = 0; + + local_irq_save(flags); + ctx = task->perf_event_ctxp; + if (!ctx || !ctx->nr_events) + goto out; + + __perf_event_task_sched_out(ctx); + + spin_lock(&ctx->lock); + + list_for_each_entry(event, &ctx->group_list, group_entry) { + if (!event->attr.enable_on_exec) + continue; + event->attr.enable_on_exec = 0; + if (event->state >= PERF_EVENT_STATE_INACTIVE) + continue; + __perf_event_mark_enabled(event, ctx); + enabled = 1; + } + + /* + * Unclone this context if we enabled any event. + */ + if (enabled) + unclone_ctx(ctx); + + spin_unlock(&ctx->lock); + + perf_event_task_sched_in(task, smp_processor_id()); + out: + local_irq_restore(flags); +} + +/* + * Cross CPU call to read the hardware event + */ +static void __perf_event_read(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event *event = info; + struct perf_event_context *ctx = event->ctx; + unsigned long flags; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. In that case + * event->count would have been updated to a recent sample + * when the event was scheduled out. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + event->pmu->read(event); + update_event_times(event); + local_irq_restore(flags); +} + +static u64 perf_event_read(struct perf_event *event) +{ + /* + * If event is enabled and currently active on a CPU, update the + * value in the event structure: + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) { + smp_call_function_single(event->oncpu, + __perf_event_read, event, 1); + } else if (event->state == PERF_EVENT_STATE_INACTIVE) { + update_event_times(event); + } + + return atomic64_read(&event->count); +} + +/* + * Initialize the perf_event context in a task_struct: + */ +static void +__perf_event_init_context(struct perf_event_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->group_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + +static struct perf_event_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_event_context *ctx; + struct perf_cpu_context *cpuctx; + struct task_struct *task; + unsigned long flags; + int err; + + /* + * If cpu is not a wildcard then this is a percpu event: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU event: */ + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a event to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + get_ctx(ctx); + + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. + */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + retry: + ctx = perf_lock_task_context(task, &flags); + if (ctx) { + unclone_ctx(ctx); + spin_unlock_irqrestore(&ctx->lock, flags); + } + + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + err = -ENOMEM; + if (!ctx) + goto errout; + __perf_event_init_context(ctx, task); + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + goto retry; + } + get_task_struct(task); + } + + put_task_struct(task); + return ctx; + + errout: + put_task_struct(task); + return ERR_PTR(err); +} + +static void free_event_rcu(struct rcu_head *head) +{ + struct perf_event *event; + + event = container_of(head, struct perf_event, rcu_head); + if (event->ns) + put_pid_ns(event->ns); + kfree(event); +} + +static void perf_pending_sync(struct perf_event *event); + +static void free_event(struct perf_event *event) +{ + perf_pending_sync(event); + + if (!event->parent) { + atomic_dec(&nr_events); + if (event->attr.mmap) + atomic_dec(&nr_mmap_events); + if (event->attr.comm) + atomic_dec(&nr_comm_events); + if (event->attr.task) + atomic_dec(&nr_task_events); + } + + if (event->output) { + fput(event->output->filp); + event->output = NULL; + } + + if (event->destroy) + event->destroy(event); + + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_event *event = file->private_data; + struct perf_event_context *ctx = event->ctx; + + file->private_data = NULL; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_event_remove_from_context(event); + mutex_unlock(&ctx->mutex); + + mutex_lock(&event->owner->perf_event_mutex); + list_del_init(&event->owner_entry); + mutex_unlock(&event->owner->perf_event_mutex); + put_task_struct(event->owner); + + free_event(event); + + return 0; +} + +static int perf_event_read_size(struct perf_event *event) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (event->attr.read_format & PERF_FORMAT_GROUP) { + nr += event->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_event_read_value(struct perf_event *event) +{ + struct perf_event *child; + u64 total = 0; + + total += perf_event_read(event); + list_for_each_entry(child, &event->child_list, child_list) + total += perf_event_read(child); + + return total; +} + +static int perf_event_read_entry(struct perf_event *event, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_event_read_group(struct perf_event *event, + u64 read_format, char __user *buf) +{ + struct perf_event *leader = event->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_event_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + err = perf_event_read_entry(sub, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_event_read_one(struct perf_event *event, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_event_read_value(event); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + +/* + * Read the performance event - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_event *event, char __user *buf, size_t count) +{ + u64 read_format = event->attr.read_format; + int ret; + + /* + * Return end-of-file for a read on a event that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (event->state == PERF_EVENT_STATE_ERROR) + return 0; + + if (count < perf_event_read_size(event)) + return -ENOSPC; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_event_read_group(event, read_format, buf); + else + ret = perf_event_read_one(event, read_format, buf); + mutex_unlock(&event->child_mutex); + + return ret; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_event *event = file->private_data; + + return perf_read_hw(event, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_event *event = file->private_data; + struct perf_mmap_data *data; + unsigned int events = POLL_HUP; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) + events = atomic_xchg(&data->poll, 0); + rcu_read_unlock(); + + poll_wait(file, &event->waitq, wait); + + return events; +} + +static void perf_event_reset(struct perf_event *event) +{ + (void)perf_event_read(event); + atomic64_set(&event->count, 0); + perf_event_update_userpage(event); +} + +/* + * Holding the top-level event's child_mutex means that any + * descendant process that has inherited this event will block + * in sync_child_event if it goes to exit, thus satisfying the + * task existence requirements of perf_event_enable/disable. + */ +static void perf_event_for_each_child(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event *child; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->child_mutex); + func(event); + list_for_each_entry(child, &event->child_list, child_list) + func(child); + mutex_unlock(&event->child_mutex); +} + +static void perf_event_for_each(struct perf_event *event, + void (*func)(struct perf_event *)) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *sibling; + + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + event = event->group_leader; + + perf_event_for_each_child(event, func); + func(event); + list_for_each_entry(sibling, &event->sibling_list, group_entry) + perf_event_for_each_child(event, func); + mutex_unlock(&ctx->mutex); +} + +static int perf_event_period(struct perf_event *event, u64 __user *arg) +{ + struct perf_event_context *ctx = event->ctx; + unsigned long size; + int ret = 0; + u64 value; + + if (!event->attr.sample_period) + return -EINVAL; + + size = copy_from_user(&value, arg, sizeof(value)); + if (size != sizeof(value)) + return -EFAULT; + + if (!value) + return -EINVAL; + + spin_lock_irq(&ctx->lock); + if (event->attr.freq) { + if (value > sysctl_perf_event_sample_rate) { + ret = -EINVAL; + goto unlock; + } + + event->attr.sample_freq = value; + } else { + event->attr.sample_period = value; + event->hw.sample_period = value; + } +unlock: + spin_unlock_irq(&ctx->lock); + + return ret; +} + +int perf_event_set_output(struct perf_event *event, int output_fd); + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_event *event = file->private_data; + void (*func)(struct perf_event *); + u32 flags = arg; + + switch (cmd) { + case PERF_EVENT_IOC_ENABLE: + func = perf_event_enable; + break; + case PERF_EVENT_IOC_DISABLE: + func = perf_event_disable; + break; + case PERF_EVENT_IOC_RESET: + func = perf_event_reset; + break; + + case PERF_EVENT_IOC_REFRESH: + return perf_event_refresh(event, arg); + + case PERF_EVENT_IOC_PERIOD: + return perf_event_period(event, (u64 __user *)arg); + + case PERF_EVENT_IOC_SET_OUTPUT: + return perf_event_set_output(event, arg); + + default: + return -ENOTTY; + } + + if (flags & PERF_IOC_FLAG_GROUP) + perf_event_for_each(event, func); + else + perf_event_for_each_child(event, func); + + return 0; +} + +int perf_event_task_enable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_enable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +int perf_event_task_disable(void) +{ + struct perf_event *event; + + mutex_lock(¤t->perf_event_mutex); + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) + perf_event_for_each_child(event, perf_event_disable); + mutex_unlock(¤t->perf_event_mutex); + + return 0; +} + +#ifndef PERF_EVENT_INDEX_OFFSET +# define PERF_EVENT_INDEX_OFFSET 0 +#endif + +static int perf_event_index(struct perf_event *event) +{ + if (event->state != PERF_EVENT_STATE_ACTIVE) + return 0; + + return event->hw.idx + 1 - PERF_EVENT_INDEX_OFFSET; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_event_update_userpage(struct perf_event *event) +{ + struct perf_event_mmap_page *userpg; + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + userpg = data->user_page; + + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); + ++userpg->lock; + barrier(); + userpg->index = perf_event_index(event); + userpg->offset = atomic64_read(&event->count); + if (event->state == PERF_EVENT_STATE_ACTIVE) + userpg->offset -= atomic64_read(&event->hw.prev_count); + + userpg->time_enabled = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + + userpg->time_running = event->total_time_running + + atomic64_read(&event->child_total_time_running); + + barrier(); + ++userpg->lock; + preempt_enable(); +unlock: + rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_event *event = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; + + if (vmf->flags & FAULT_FLAG_MKWRITE) { + if (vmf->pgoff == 0) + ret = 0; + return ret; + } + + rcu_read_lock(); + data = rcu_dereference(event->data); + if (!data) + goto unlock; + + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + if (vmf->flags & FAULT_FLAG_WRITE) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } + + get_page(vmf->page); + vmf->page->mapping = vma->vm_file->f_mapping; + vmf->page->index = vmf->pgoff; + + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&event->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); + + if (event->attr.watermark) { + data->watermark = min_t(long, PAGE_SIZE * nr_pages, + event->attr.wakeup_watermark); + } + if (!data->watermark) + data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4); + + rcu_assign_pointer(event->data, data); + + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void perf_mmap_free_page(unsigned long addr) +{ + struct page *page = virt_to_page((void *)addr); + + page->mapping = NULL; + __free_page(page); +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data; + int i; + + data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + + perf_mmap_free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + perf_mmap_free_page((unsigned long)data->data_pages[i]); + + kfree(data); +} + +static void perf_mmap_data_free(struct perf_event *event) +{ + struct perf_mmap_data *data = event->data; + + WARN_ON(atomic_read(&event->mmap_count)); + + rcu_assign_pointer(event->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + atomic_inc(&event->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_event *event = vma->vm_file->private_data; + + WARN_ON_ONCE(event->ctx->parent_ctx); + if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= event->data->nr_locked; + perf_mmap_data_free(event); + mutex_unlock(&event->mmap_mutex); + } +} + +static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, + .fault = perf_mmap_fault, + .page_mkwrite = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_event *event = file->private_data; + unsigned long user_locked, user_lock_limit; + struct user_struct *user = current_user(); + unsigned long locked, lock_limit; + unsigned long vma_size; + unsigned long nr_pages; + long user_extra, extra; + int ret = 0; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) + return -EINVAL; + + if (vma_size != PAGE_SIZE * (1 + nr_pages)) + return -EINVAL; + + if (vma->vm_pgoff != 0) + return -EINVAL; + + WARN_ON_ONCE(event->ctx->parent_ctx); + mutex_lock(&event->mmap_mutex); + if (event->output) { + ret = -EINVAL; + goto unlock; + } + + if (atomic_inc_not_zero(&event->mmap_count)) { + if (nr_pages != event->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; + + if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && + !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } + + WARN_ON(event->data); + ret = perf_mmap_data_alloc(event, nr_pages); + if (ret) + goto unlock; + + atomic_set(&event->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + event->data->nr_locked = extra; + if (vma->vm_flags & VM_WRITE) + event->data->writable = 1; + +unlock: + mutex_unlock(&event->mmap_mutex); + + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + + return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct inode *inode = filp->f_path.dentry->d_inode; + struct perf_event *event = filp->private_data; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &event->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, + .fasync = perf_fasync, +}; + +/* + * Perf event wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_event_wakeup(struct perf_event *event) +{ + wake_up_all(&event->waitq); + + if (event->pending_kill) { + kill_fasync(&event->fasync, SIGIO, event->pending_kill); + event->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_event(struct perf_pending_entry *entry) +{ + struct perf_event *event = container_of(entry, + struct perf_event, pending); + + if (event->pending_disable) { + event->pending_disable = 0; + __perf_event_disable(event); + } + + if (event->pending_wakeup) { + event->pending_wakeup = 0; + perf_event_wakeup(event); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_event_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_event *event) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return event->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_event *event) +{ + wait_event(event->waitq, perf_not_pending(event)); +} + +void perf_event_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + +/* + * Output + */ +static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail, + unsigned long offset, unsigned long head) +{ + unsigned long mask; + + if (!data->writable) + return true; + + mask = (data->nr_pages << PAGE_SHIFT) - 1; + + offset = (offset - tail) & mask; + head = (head - tail) & mask; + + if ((int)(head - offset) < 0) + return false; + + return true; +} + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->event->pending_wakeup = 1; + perf_pending_queue(&handle->event->pending, + perf_pending_event); + } else + perf_event_wakeup(handle->event); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event_id doesn't publish a head when a former + * event_id isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event_id completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + unsigned long head; + int cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_long_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_long_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + +void perf_output_copy(struct perf_output_handle *handle, + const void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; + + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +int perf_output_begin(struct perf_output_handle *handle, + struct perf_event *event, unsigned int size, + int nmi, int sample) +{ + struct perf_event *output_event; + struct perf_mmap_data *data; + unsigned long tail, offset, head; + int have_lost; + struct { + struct perf_event_header header; + u64 id; + u64 lost; + } lost_event; + + rcu_read_lock(); + /* + * For inherited events we send all the output towards the parent. + */ + if (event->parent) + event = event->parent; + + output_event = rcu_dereference(event->output); + if (output_event) + event = output_event; + + data = rcu_dereference(event->data); + if (!data) + goto out; + + handle->data = data; + handle->event = event; + handle->nmi = nmi; + handle->sample = sample; + + if (!data->nr_pages) + goto fail; + + have_lost = atomic_read(&data->lost); + if (have_lost) + size += sizeof(lost_event); + + perf_output_lock(handle); + + do { + /* + * Userspace could choose to issue a mb() before updating the + * tail pointer. So that all reads will be completed before the + * write is issued. + */ + tail = ACCESS_ONCE(data->user_page->data_tail); + smp_rmb(); + offset = head = atomic_long_read(&data->head); + head += size; + if (unlikely(!perf_output_space(data, tail, offset, head))) + goto fail; + } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + + handle->offset = offset; + handle->head = head; + + if (head - tail > data->watermark) + atomic_set(&data->wakeup, 1); + + if (have_lost) { + lost_event.header.type = PERF_RECORD_LOST; + lost_event.header.misc = 0; + lost_event.header.size = sizeof(lost_event); + lost_event.id = event->id; + lost_event.lost = atomic_xchg(&data->lost, 0); + + perf_output_put(handle, lost_event); + } + + return 0; + +fail: + atomic_inc(&data->lost); + perf_output_unlock(handle); +out: + rcu_read_unlock(); + + return -ENOSPC; +} + +void perf_output_end(struct perf_output_handle *handle) +{ + struct perf_event *event = handle->event; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = event->attr.wakeup_events; + + if (handle->sample && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } + } + + perf_output_unlock(handle); + rcu_read_unlock(); +} + +static u32 perf_event_pid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_tgid_nr_ns(p, event->ns); +} + +static u32 perf_event_tid(struct perf_event *event, struct task_struct *p) +{ + /* + * only top level events have the pid namespace they were created in + */ + if (event->parent) + event = event->parent; + + return task_pid_nr_ns(p, event->ns); +} + +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_event *event) +{ + u64 read_format = event->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&event->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = event->total_time_enabled + + atomic64_read(&event->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = event->total_time_running + + atomic64_read(&event->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(event); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited events seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_event *event) +{ + struct perf_event *leader = event->group_leader, *sub; + u64 read_format = event->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != event) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, group_entry) { + n = 0; + + if (sub != event) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_event_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_event *event) +{ + if (event->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, event); + else + perf_output_read_one(handle, event); +} + +void perf_output_sample(struct perf_output_handle *handle, + struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event) +{ + u64 sample_type = data->type; + + perf_output_put(handle, *header); + + if (sample_type & PERF_SAMPLE_IP) + perf_output_put(handle, data->ip); + + if (sample_type & PERF_SAMPLE_TID) + perf_output_put(handle, data->tid_entry); + + if (sample_type & PERF_SAMPLE_TIME) + perf_output_put(handle, data->time); + + if (sample_type & PERF_SAMPLE_ADDR) + perf_output_put(handle, data->addr); + + if (sample_type & PERF_SAMPLE_ID) + perf_output_put(handle, data->id); + + if (sample_type & PERF_SAMPLE_STREAM_ID) + perf_output_put(handle, data->stream_id); + + if (sample_type & PERF_SAMPLE_CPU) + perf_output_put(handle, data->cpu_entry); + + if (sample_type & PERF_SAMPLE_PERIOD) + perf_output_put(handle, data->period); + + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(handle, event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + if (data->callchain) { + int size = 1; + + if (data->callchain) + size += data->callchain->nr; + + size *= sizeof(u64); + + perf_output_copy(handle, data->callchain, size); + } else { + u64 nr = 0; + perf_output_put(handle, nr); + } + } + + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(handle, data->raw->size); + perf_output_copy(handle, data->raw->data, + data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(handle, raw); + } + } +} + +void perf_prepare_sample(struct perf_event_header *header, + struct perf_sample_data *data, + struct perf_event *event, + struct pt_regs *regs) +{ + u64 sample_type = event->attr.sample_type; + + data->type = sample_type; + + header->type = PERF_RECORD_SAMPLE; + header->size = sizeof(*header); + + header->misc = 0; + header->misc |= perf_misc_flags(regs); + + if (sample_type & PERF_SAMPLE_IP) { + data->ip = perf_instruction_pointer(regs); + + header->size += sizeof(data->ip); + } + + if (sample_type & PERF_SAMPLE_TID) { + /* namespace issues */ + data->tid_entry.pid = perf_event_pid(event, current); + data->tid_entry.tid = perf_event_tid(event, current); + + header->size += sizeof(data->tid_entry); + } + + if (sample_type & PERF_SAMPLE_TIME) { + data->time = perf_clock(); + + header->size += sizeof(data->time); + } + + if (sample_type & PERF_SAMPLE_ADDR) + header->size += sizeof(data->addr); + + if (sample_type & PERF_SAMPLE_ID) { + data->id = primary_event_id(event); + + header->size += sizeof(data->id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = event->id; + + header->size += sizeof(data->stream_id); + } + + if (sample_type & PERF_SAMPLE_CPU) { + data->cpu_entry.cpu = raw_smp_processor_id(); + data->cpu_entry.reserved = 0; + + header->size += sizeof(data->cpu_entry); + } + + if (sample_type & PERF_SAMPLE_PERIOD) + header->size += sizeof(data->period); + + if (sample_type & PERF_SAMPLE_READ) + header->size += perf_event_read_size(event); + + if (sample_type & PERF_SAMPLE_CALLCHAIN) { + int size = 1; + + data->callchain = perf_callchain(regs); + + if (data->callchain) + size += data->callchain->nr; + + header->size += size * sizeof(u64); + } + + if (sample_type & PERF_SAMPLE_RAW) { + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header->size += size; + } +} + +static void perf_event_output(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_output_handle handle; + struct perf_event_header header; + + perf_prepare_sample(&header, data, event, regs); + + if (perf_output_begin(&handle, event, header.size, nmi, 1)) + return; + + perf_output_sample(&handle, &header, data, event); + + perf_output_end(&handle); +} + +/* + * read event_id + */ + +struct perf_read_event { + struct perf_event_header header; + + u32 pid; + u32 tid; +}; + +static void +perf_event_read_event(struct perf_event *event, + struct task_struct *task) +{ + struct perf_output_handle handle; + struct perf_read_event read_event = { + .header = { + .type = PERF_RECORD_READ, + .misc = 0, + .size = sizeof(read_event) + perf_event_read_size(event), + }, + .pid = perf_event_pid(event, task), + .tid = perf_event_tid(event, task), + }; + int ret; + + ret = perf_output_begin(&handle, event, read_event.header.size, 0, 0); + if (ret) + return; + + perf_output_put(&handle, read_event); + perf_output_read(&handle, event); + + perf_output_end(&handle); +} + +/* + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task + */ + +struct perf_task_event { + struct task_struct *task; + struct perf_event_context *task_ctx; + + struct { + struct perf_event_header header; + + u32 pid; + u32 ppid; + u32 tid; + u32 ptid; + u64 time; + } event_id; +}; + +static void perf_event_task_output(struct perf_event *event, + struct perf_task_event *task_event) +{ + struct perf_output_handle handle; + int size; + struct task_struct *task = task_event->task; + int ret; + + size = task_event->event_id.header.size; + ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + task_event->event_id.pid = perf_event_pid(event, task); + task_event->event_id.ppid = perf_event_pid(event, current); + + task_event->event_id.tid = perf_event_tid(event, task); + task_event->event_id.ptid = perf_event_tid(event, current); + + task_event->event_id.time = perf_clock(); + + perf_output_put(&handle, task_event->event_id); + + perf_output_end(&handle); +} + +static int perf_event_task_match(struct perf_event *event) +{ + if (event->attr.comm || event->attr.mmap || event->attr.task) + return 1; + + return 0; +} + +static void perf_event_task_ctx(struct perf_event_context *ctx, + struct perf_task_event *task_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_task_match(event)) + perf_event_task_output(event, task_event); + } + rcu_read_unlock(); +} + +static void perf_event_task_event(struct perf_task_event *task_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx = task_event->task_ctx; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_event_ctxp); + if (ctx) + perf_event_task_ctx(ctx, task_event); + rcu_read_unlock(); +} + +static void perf_event_task(struct task_struct *task, + struct perf_event_context *task_ctx, + int new) +{ + struct perf_task_event task_event; + + if (!atomic_read(&nr_comm_events) && + !atomic_read(&nr_mmap_events) && + !atomic_read(&nr_task_events)) + return; + + task_event = (struct perf_task_event){ + .task = task, + .task_ctx = task_ctx, + .event_id = { + .header = { + .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT, + .misc = 0, + .size = sizeof(task_event.event_id), + }, + /* .pid */ + /* .ppid */ + /* .tid */ + /* .ptid */ + }, + }; + + perf_event_task_event(&task_event); +} + +void perf_event_fork(struct task_struct *task) +{ + perf_event_task(task, NULL, 1); +} + +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event_id; +}; + +static void perf_event_comm_output(struct perf_event *event, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + comm_event->event_id.pid = perf_event_pid(event, comm_event->task); + comm_event->event_id.tid = perf_event_tid(event, comm_event->task); + + perf_output_put(&handle, comm_event->event_id); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_event_comm_match(struct perf_event *event) +{ + if (event->attr.comm) + return 1; + + return 0; +} + +static void perf_event_comm_ctx(struct perf_event_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_comm_match(event)) + perf_event_comm_output(event, comm_event); + } + rcu_read_unlock(); +} + +static void perf_event_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + unsigned int size; + char comm[TASK_COMM_LEN]; + + memset(comm, 0, sizeof(comm)); + strncpy(comm, comm_event->task->comm, sizeof(comm)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); + rcu_read_unlock(); +} + +void perf_event_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (task->perf_event_ctxp) + perf_event_enable_on_exec(task); + + if (!atomic_read(&nr_comm_events)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + /* .comm */ + /* .comm_size */ + .event_id = { + .header = { + .type = PERF_RECORD_COMM, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + }, + }; + + perf_event_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct vm_area_struct *vma; + + const char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event_id; +}; + +static void perf_event_mmap_output(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event_id.header.size; + int ret = perf_output_begin(&handle, event, size, 0, 0); + + if (ret) + return; + + mmap_event->event_id.pid = perf_event_pid(event, current); + mmap_event->event_id.tid = perf_event_tid(event, current); + + perf_output_put(&handle, mmap_event->event_id); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} + +static int perf_event_mmap_match(struct perf_event *event, + struct perf_mmap_event *mmap_event) +{ + if (event->attr.mmap) + return 1; + + return 0; +} + +static void perf_event_mmap_ctx(struct perf_event_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_event_mmap_match(event, mmap_event)) + perf_event_mmap_output(event, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; + struct vm_area_struct *vma = mmap_event->vma; + struct file *file = vma->vm_file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + const char *name; + + memset(tmp, 0, sizeof(tmp)); + + if (file) { + /* + * d_path works from the end of the buffer backwards, so we + * need to add enough zero bytes after the string to handle + * the 64bit alignment we do later. + */ + buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + if (arch_vma_name(mmap_event->vma)) { + name = strncpy(tmp, arch_vma_name(mmap_event->vma), + sizeof(tmp)); + goto got_name; + } + + if (!vma->vm_mm) { + name = strncpy(tmp, "[vdso]", sizeof(tmp)); + goto got_name; + } + + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_event_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_event_mmap_ctx(ctx, mmap_event); + rcu_read_unlock(); + + kfree(buf); +} + +void __perf_event_mmap(struct vm_area_struct *vma) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_events)) + return; + + mmap_event = (struct perf_mmap_event){ + .vma = vma, + /* .file_name */ + /* .file_size */ + .event_id = { + .header = { + .type = PERF_RECORD_MMAP, + .misc = 0, + /* .size */ + }, + /* .pid */ + /* .tid */ + .start = vma->vm_start, + .len = vma->vm_end - vma->vm_start, + .pgoff = vma->vm_pgoff, + }, + }; + + perf_event_mmap_event(&mmap_event); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_event *event, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 id; + u64 stream_id; + } throttle_event = { + .header = { + .type = PERF_RECORD_THROTTLE, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = perf_clock(), + .id = primary_event_id(event), + .stream_id = event->id, + }; + + if (enable) + throttle_event.header.type = PERF_RECORD_UNTHROTTLE; + + ret = perf_output_begin(&handle, event, sizeof(throttle_event), 1, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + +/* + * Generic event overflow handling, sampling. + */ + +static int __perf_event_overflow(struct perf_event *event, int nmi, + int throttle, struct perf_sample_data *data, + struct pt_regs *regs) +{ + int events = atomic_read(&event->event_limit); + struct hw_perf_event *hwc = &event->hw; + int ret = 0; + + throttle = (throttle && event->pmu->unthrottle != NULL); + + if (!throttle) { + hwc->interrupts++; + } else { + if (hwc->interrupts != MAX_INTERRUPTS) { + hwc->interrupts++; + if (HZ * hwc->interrupts > + (u64)sysctl_perf_event_sample_rate) { + hwc->interrupts = MAX_INTERRUPTS; + perf_log_throttle(event, 0); + ret = 1; + } + } else { + /* + * Keep re-disabling events even though on the previous + * pass we disabled it - just in case we raced with a + * sched-in and the event got enabled again: + */ + ret = 1; + } + } + + if (event->attr.freq) { + u64 now = perf_clock(); + s64 delta = now - hwc->freq_stamp; + + hwc->freq_stamp = now; + + if (delta > 0 && delta < TICK_NSEC) + perf_adjust_period(event, NSEC_PER_SEC / (int)delta); + } + + /* + * XXX event_limit might not quite work as expected on inherited + * events + */ + + event->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&event->event_limit)) { + ret = 1; + event->pending_kill = POLL_HUP; + if (nmi) { + event->pending_disable = 1; + perf_pending_queue(&event->pending, + perf_pending_event); + } else + perf_event_disable(event); + } + + perf_event_output(event, nmi, data, regs); + return ret; +} + +int perf_event_overflow(struct perf_event *event, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + return __perf_event_overflow(event, nmi, 1, data, regs); +} + +/* + * Generic software event infrastructure + */ + +/* + * We directly increment event->count and keep a second value in + * event->hw.period_left to count intervals. This period event + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swevent_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; + +again: + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) + goto again; + + return nr; +} + +static void perf_swevent_overflow(struct perf_event *event, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + int throttle = 0; + u64 overflow; + + data->period = event->hw.last_period; + overflow = perf_swevent_set_period(event); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (__perf_event_overflow(event, nmi, throttle, + data, regs)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + throttle = 1; + } +} + +static void perf_swevent_unthrottle(struct perf_event *event) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swevent_add(struct perf_event *event, u64 nr, + int nmi, struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct hw_perf_event *hwc = &event->hw; + + atomic64_add(nr, &event->count); + + if (!hwc->sample_period) + return; + + if (!regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swevent_overflow(event, nmi, data, regs); +} + +static int perf_swevent_is_counting(struct perf_event *event) +{ + /* + * The event is active, we're good! + */ + if (event->state == PERF_EVENT_STATE_ACTIVE) + return 1; + + /* + * The event is off/error, not counting. + */ + if (event->state != PERF_EVENT_STATE_INACTIVE) + return 0; + + /* + * The event is inactive, if the context is active + * we're part of a group that didn't make it on the 'pmu', + * not counting. + */ + if (event->ctx->is_active) + return 0; + + /* + * We're inactive and the context is too, this means the + * task is scheduled out, we're counting events that happen + * to us, like migration events. + */ + return 1; +} + +static int perf_swevent_match(struct perf_event *event, + enum perf_type_id type, + u32 event_id, struct pt_regs *regs) +{ + if (!perf_swevent_is_counting(event)) + return 0; + + if (event->attr.type != type) + return 0; + if (event->attr.config != event_id) + return 0; + + if (regs) { + if (event->attr.exclude_user && user_mode(regs)) + return 0; + + if (event->attr.exclude_kernel && !user_mode(regs)) + return 0; + } + + return 1; +} + +static void perf_swevent_ctx_event(struct perf_event_context *ctx, + enum perf_type_id type, + u32 event_id, u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_event *event; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { + if (perf_swevent_match(event, type, event_id, regs)) + perf_swevent_add(event, nr, nmi, data, regs); + } + rcu_read_unlock(); +} + +static int *perf_swevent_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + +static void do_perf_sw_event(enum perf_type_id type, u32 event_id, + u64 nr, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swevent_recursion_context(cpuctx); + struct perf_event_context *ctx; + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); + + perf_swevent_ctx_event(&cpuctx->ctx, type, event_id, + nr, nmi, data, regs); + rcu_read_lock(); + /* + * doesn't really matter which of the child contexts the + * events ends up in. + */ + ctx = rcu_dereference(current->perf_event_ctxp); + if (ctx) + perf_swevent_ctx_event(ctx, type, event_id, nr, nmi, data, regs); + rcu_read_unlock(); + + barrier(); + (*recursion)--; + +out: + put_cpu_var(perf_cpu_context); +} + +void __perf_sw_event(u32 event_id, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) +{ + struct perf_sample_data data = { + .addr = addr, + }; + + do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, + &data, regs); +} + +static void perf_swevent_read(struct perf_event *event) +{ +} + +static int perf_swevent_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swevent_set_period(event); + } + return 0; +} + +static void perf_swevent_disable(struct perf_event *event) +{ +} + +static const struct pmu perf_ops_generic = { + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, + .read = perf_swevent_read, + .unthrottle = perf_swevent_unthrottle, +}; + +/* + * hrtimer based swevent callback + */ + +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; + + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + data.addr = 0; + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((event->attr.exclude_kernel || !regs) && + !event->attr.exclude_user) + regs = task_pt_regs(current); + + if (regs) { + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_perf_event_update(struct perf_event *event) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&event->hw.prev_count); + atomic64_set(&event->hw.prev_count, now); + atomic64_add(now - prev, &event->count); +} + +static int cpu_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void cpu_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + cpu_clock_perf_event_update(event); +} + +static void cpu_clock_perf_event_read(struct perf_event *event) +{ + cpu_clock_perf_event_update(event); +} + +static const struct pmu perf_ops_cpu_clock = { + .enable = cpu_clock_perf_event_enable, + .disable = cpu_clock_perf_event_disable, + .read = cpu_clock_perf_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_perf_event_update(struct perf_event *event, u64 now) +{ + u64 prev; + s64 delta; + + prev = atomic64_xchg(&event->hw.prev_count, now); + delta = now - prev; + atomic64_add(delta, &event->count); +} + +static int task_clock_perf_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; + + now = event->ctx->time; + + atomic64_set(&hwc->prev_count, now); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period = max_t(u64, 10000, hwc->sample_period); + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + +static void task_clock_perf_event_disable(struct perf_event *event) +{ + if (event->hw.sample_period) + hrtimer_cancel(&event->hw.hrtimer); + task_clock_perf_event_update(event, event->ctx->time); + +} + +static void task_clock_perf_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_perf_event_update(event, time); +} + +static const struct pmu perf_ops_task_clock = { + .enable = task_clock_perf_event_enable, + .disable = task_clock_perf_event_disable, + .read = task_clock_perf_event_read, +}; + +#ifdef CONFIG_EVENT_PROFILE +void perf_tp_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) +{ + struct perf_raw_record raw = { + .size = entry_size, + .data = record, + }; + + struct perf_sample_data data = { + .addr = addr, + .raw = &raw, + }; + + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, + &data, regs); +} +EXPORT_SYMBOL_GPL(perf_tp_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_event_destroy(struct perf_event *event) +{ + ftrace_profile_disable(event->attr.config); +} + +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_paranoid_tracepoint_raw() && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (ftrace_profile_enable(event->attr.config)) + return NULL; + + event->destroy = tp_perf_event_destroy; + + return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_event_init(struct perf_event *event) +{ + return NULL; +} +#endif + +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; + +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); +} + +static const struct pmu *sw_perf_event_init(struct perf_event *event) +{ + const struct pmu *pmu = NULL; + u64 event_id = event->attr.config; + + /* + * Software events (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_TASK_CLOCK: + /* + * If the user instantiates this as a per-cpu event, + * use the cpu_clock event instead. + */ + if (event->ctx->task) + pmu = &perf_ops_task_clock; + else + pmu = &perf_ops_cpu_clock; + + break; + case PERF_COUNT_SW_PAGE_FAULTS: + case PERF_COUNT_SW_PAGE_FAULTS_MIN: + case PERF_COUNT_SW_PAGE_FAULTS_MAJ: + case PERF_COUNT_SW_CONTEXT_SWITCHES: + case PERF_COUNT_SW_CPU_MIGRATIONS: + if (!event->parent) { + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + pmu = &perf_ops_generic; + break; + } + + return pmu; +} + +/* + * Allocate and initialize a event structure + */ +static struct perf_event * +perf_event_alloc(struct perf_event_attr *attr, + int cpu, + struct perf_event_context *ctx, + struct perf_event *group_leader, + struct perf_event *parent_event, + gfp_t gfpflags) +{ + const struct pmu *pmu; + struct perf_event *event; + struct hw_perf_event *hwc; + long err; + + event = kzalloc(sizeof(*event), gfpflags); + if (!event) + return ERR_PTR(-ENOMEM); + + /* + * Single events are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = event; + + mutex_init(&event->child_mutex); + INIT_LIST_HEAD(&event->child_list); + + INIT_LIST_HEAD(&event->group_entry); + INIT_LIST_HEAD(&event->event_entry); + INIT_LIST_HEAD(&event->sibling_list); + init_waitqueue_head(&event->waitq); + + mutex_init(&event->mmap_mutex); + + event->cpu = cpu; + event->attr = *attr; + event->group_leader = group_leader; + event->pmu = NULL; + event->ctx = ctx; + event->oncpu = -1; + + event->parent = parent_event; + + event->ns = get_pid_ns(current->nsproxy->pid_ns); + event->id = atomic64_inc_return(&perf_event_id); + + event->state = PERF_EVENT_STATE_INACTIVE; + + if (attr->disabled) + event->state = PERF_EVENT_STATE_OFF; + + pmu = NULL; + + hwc = &event->hw; + hwc->sample_period = attr->sample_period; + if (attr->freq && attr->sample_freq) + hwc->sample_period = 1; + hwc->last_period = hwc->sample_period; + + atomic64_set(&hwc->period_left, hwc->sample_period); + + /* + * we currently do not support PERF_FORMAT_GROUP on inherited events + */ + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) + goto done; + + switch (attr->type) { + case PERF_TYPE_RAW: + case PERF_TYPE_HARDWARE: + case PERF_TYPE_HW_CACHE: + pmu = hw_perf_event_init(event); + break; + + case PERF_TYPE_SOFTWARE: + pmu = sw_perf_event_init(event); + break; + + case PERF_TYPE_TRACEPOINT: + pmu = tp_perf_event_init(event); + break; + + default: + break; + } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); + + if (err) { + if (event->ns) + put_pid_ns(event->ns); + kfree(event); + return ERR_PTR(err); + } + + event->pmu = pmu; + + if (!event->parent) { + atomic_inc(&nr_events); + if (event->attr.mmap) + atomic_inc(&nr_mmap_events); + if (event->attr.comm) + atomic_inc(&nr_comm_events); + if (event->attr.task) + atomic_inc(&nr_task_events); + } + + return event; +} + +static int perf_copy_attr(struct perf_event_attr __user *uattr, + struct perf_event_attr *attr) +{ + u32 size; + int ret; + + if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0)) + return -EFAULT; + + /* + * zero the full structure, so that a short copy will be nice. + */ + memset(attr, 0, sizeof(*attr)); + + ret = get_user(size, &uattr->size); + if (ret) + return ret; + + if (size > PAGE_SIZE) /* silly large */ + goto err_size; + + if (!size) /* abi compat */ + size = PERF_ATTR_SIZE_VER0; + + if (size < PERF_ATTR_SIZE_VER0) + goto err_size; + + /* + * If we're handed a bigger struct than we know of, + * ensure all the unknown bits are 0 - i.e. new + * user-space does not rely on any kernel feature + * extensions we dont know about yet. + */ + if (size > sizeof(*attr)) { + unsigned char __user *addr; + unsigned char __user *end; + unsigned char val; + + addr = (void __user *)uattr + sizeof(*attr); + end = (void __user *)uattr + size; + + for (; addr < end; addr++) { + ret = get_user(val, addr); + if (ret) + return ret; + if (val) + goto err_size; + } + size = sizeof(*attr); + } + + ret = copy_from_user(attr, uattr, size); + if (ret) + return -EFAULT; + + /* + * If the type exists, the corresponding creation will verify + * the attr->config. + */ + if (attr->type >= PERF_TYPE_MAX) + return -EINVAL; + + if (attr->__reserved_1 || attr->__reserved_2 || attr->__reserved_3) + return -EINVAL; + + if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) + return -EINVAL; + + if (attr->read_format & ~(PERF_FORMAT_MAX-1)) + return -EINVAL; + +out: + return ret; + +err_size: + put_user(sizeof(*attr), &uattr->size); + ret = -E2BIG; + goto out; +} + +int perf_event_set_output(struct perf_event *event, int output_fd) +{ + struct perf_event *output_event = NULL; + struct file *output_file = NULL; + struct perf_event *old_output; + int fput_needed = 0; + int ret = -EINVAL; + + if (!output_fd) + goto set; + + output_file = fget_light(output_fd, &fput_needed); + if (!output_file) + return -EBADF; + + if (output_file->f_op != &perf_fops) + goto out; + + output_event = output_file->private_data; + + /* Don't chain output fds */ + if (output_event->output) + goto out; + + /* Don't set an output fd when we already have an output channel */ + if (event->data) + goto out; + + atomic_long_inc(&output_file->f_count); + +set: + mutex_lock(&event->mmap_mutex); + old_output = event->output; + rcu_assign_pointer(event->output, output_event); + mutex_unlock(&event->mmap_mutex); + + if (old_output) { + /* + * we need to make sure no existing perf_output_*() + * is still referencing this event. + */ + synchronize_rcu(); + fput(old_output->filp); + } + + ret = 0; +out: + fput_light(output_file, fput_needed); + return ret; +} + +/** + * sys_perf_event_open - open a performance event, associate it to a task/cpu + * + * @attr_uptr: event_id type attributes for monitoring/sampling + * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader event fd + */ +SYSCALL_DEFINE5(perf_event_open, + struct perf_event_attr __user *, attr_uptr, + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ + struct perf_event *event, *group_leader; + struct perf_event_attr attr; + struct perf_event_context *ctx; + struct file *event_file = NULL; + struct file *group_file = NULL; + int fput_needed = 0; + int fput_needed2 = 0; + int err; + + /* for future expandability... */ + if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT)) + return -EINVAL; + + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; + + if (!attr.exclude_kernel) { + if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + } + + if (attr.freq) { + if (attr.sample_freq > sysctl_perf_event_sample_rate) + return -EINVAL; + } + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this event to it): + */ + group_leader = NULL; + if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + err = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto err_put_context; + if (group_file->f_op != &perf_fops) + goto err_put_context; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): + */ + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (attr.exclusive || attr.pinned) + goto err_put_context; + } + + event = perf_event_alloc(&attr, cpu, ctx, group_leader, + NULL, GFP_KERNEL); + err = PTR_ERR(event); + if (IS_ERR(event)) + goto err_put_context; + + err = anon_inode_getfd("[perf_event]", &perf_fops, event, 0); + if (err < 0) + goto err_free_put_context; + + event_file = fget_light(err, &fput_needed2); + if (!event_file) + goto err_free_put_context; + + if (flags & PERF_FLAG_FD_OUTPUT) { + err = perf_event_set_output(event, group_fd); + if (err) + goto err_fput_free_put_context; + } + + event->filp = event_file; + WARN_ON_ONCE(ctx->parent_ctx); + mutex_lock(&ctx->mutex); + perf_install_in_context(ctx, event, cpu); + ++ctx->generation; + mutex_unlock(&ctx->mutex); + + event->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_event_mutex); + list_add_tail(&event->owner_entry, ¤t->perf_event_list); + mutex_unlock(¤t->perf_event_mutex); + +err_fput_free_put_context: + fput_light(event_file, fput_needed2); + +err_free_put_context: + if (err < 0) + kfree(event); + +err_put_context: + if (err < 0) + put_ctx(ctx); + + fput_light(group_file, fput_needed); + + return err; +} + +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, child_ctx, + group_leader, parent_event, + GFP_KERNEL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. + */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) + child_event->hw.sample_period = parent_event->hw.sample_period; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + +static void sync_child_event(struct perf_event *child_event, + struct task_struct *child) +{ + struct perf_event *parent_event = child_event->parent; + u64 child_val; + + if (child_event->attr.inherit_stat) + perf_event_read_event(child_event, child); + + child_val = atomic64_read(&child_event->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_event->count); + atomic64_add(child_event->total_time_enabled, + &parent_event->child_total_time_enabled); + atomic64_add(child_event->total_time_running, + &parent_event->child_total_time_running); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Release the parent event, if this was the last + * reference to it. + */ + fput(parent_event->filp); +} + +static void +__perf_event_exit_task(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) +{ + struct perf_event *parent_event; + + update_event_times(child_event); + perf_event_remove_from_context(child_event); + + parent_event = child_event->parent; + /* + * It can happen that parent exits first, and has events + * that are still around due to the child reference. These + * events need to be zapped - but otherwise linger. + */ + if (parent_event) { + sync_child_event(child_event, child); + free_event(child_event); + } +} + +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + struct perf_event *child_event, *tmp; + struct perf_event_context *child_ctx; + unsigned long flags; + + if (likely(!child->perf_event_ctxp)) { + perf_event_task(child, NULL, 0); + return; + } + + local_irq_save(flags); + /* + * We can't reschedule here because interrupts are disabled, + * and either child is current or it is a task that can't be + * scheduled, so we are now safe from rescheduling changing + * our context. + */ + child_ctx = child->perf_event_ctxp; + __perf_event_task_sched_out(child_ctx); + + /* + * Take the context lock here so that if find_get_context is + * reading child->perf_event_ctxp, we wait until it has + * incremented the context's refcount before we do put_ctx below. + */ + spin_lock(&child_ctx->lock); + child->perf_event_ctxp = NULL; + /* + * If this context is a clone; unclone it so it can't get + * swapped to another process while we're removing all + * the events from it. + */ + unclone_ctx(child_ctx); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the events so that we + * won't get any samples after PERF_RECORD_EXIT. We can however still + * get a few PERF_RECORD_READ events. + */ + perf_event_task(child, child_ctx, 0); + + /* + * We can recurse on the same lock type through: + * + * __perf_event_exit_task() + * sync_child_event() + * fput(parent_event->filp) + * perf_release() + * mutex_lock(&ctx->mutex) + * + * But since its the parent context it won't be the same instance. + */ + mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: + list_for_each_entry_safe(child_event, tmp, &child_ctx->group_list, + group_entry) + __perf_event_exit_task(child_event, child_ctx, child); + + /* + * If the last event was a group event, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->group_list)) + goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_event_free_task(struct task_struct *task) +{ + struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event *event, *tmp; + + if (!ctx) + return; + + mutex_lock(&ctx->mutex); +again: + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) { + struct perf_event *parent = event->parent; + + if (WARN_ON_ONCE(!parent)) + continue; + + mutex_lock(&parent->child_mutex); + list_del_init(&event->child_list); + mutex_unlock(&parent->child_mutex); + + fput(parent->filp); + + list_del_event(event, ctx); + free_event(event); + } + + if (!list_empty(&ctx->group_list)) + goto again; + + mutex_unlock(&ctx->mutex); + + put_ctx(ctx); +} + +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *cloned_ctx; + struct perf_event *event; + struct task_struct *parent = current; + int inherited_all = 1; + int ret = 0; + + child->perf_event_ctxp = NULL; + + mutex_init(&child->perf_event_mutex); + INIT_LIST_HEAD(&child->perf_event_list); + + if (likely(!parent->perf_event_ctxp)) + return 0; + + /* + * This is executed from the parent task context, so inherit + * events that have been marked for cloning. + * First allocate and initialize a context for the child. + */ + + child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!child_ctx) + return -ENOMEM; + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + + /* + * If the parent's context is a clone, pin it so it won't get + * swapped under us. + */ + parent_ctx = perf_pin_task_context(parent); + + /* + * No need to check if parent_ctx != NULL here; since we saw + * it non-NULL earlier, the only reason for it to become NULL + * is if we exit, and since we're currently in the middle of + * a fork we can't be exiting at the same time. + */ + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + mutex_lock(&parent_ctx->mutex); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) { + if (event != event->group_leader) + continue; + + if (!event->attr.inherit) { + inherited_all = 0; + continue; + } + + ret = inherit_group(event, parent, parent_ctx, + child, child_ctx); + if (ret) { + inherited_all = 0; + break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + * Note that if the parent is a clone, it could get + * uncloned at any point, but that doesn't matter + * because the list of events and the generation + * count can't have changed since we took the mutex. + */ + cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); + if (cloned_ctx) { + child_ctx->parent_ctx = cloned_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); + } + + mutex_unlock(&parent_ctx->mutex); + + perf_unpin_context(parent_ctx); + + return ret; +} + +static void __cpuinit perf_event_init_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + + spin_lock(&perf_resource_lock); + cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; + spin_unlock(&perf_resource_lock); + + hw_perf_event_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_event_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event *event, *tmp; + + list_for_each_entry_safe(event, tmp, &ctx->group_list, group_entry) + __perf_event_remove_from_context(event); +} +static void perf_event_exit_cpu(int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_event_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_event_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_event_init_cpu(cpu); + break; + + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_event_setup_online(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_event_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, + .priority = 20, +}; + +void __init perf_event_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_events) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_events - cpuctx->ctx.nr_events, + perf_max_events - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + spin_unlock(&perf_resource_lock); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + spin_lock(&perf_resource_lock); + perf_overcommit = val; + spin_unlock(&perf_resource_lock); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_events", +}; + +static int __init perf_event_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_event_sysfs_init); diff --git a/kernel/sched.c b/kernel/sched.c index faf4d463bbff..291c8d213d13 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include #include @@ -2059,7 +2059,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); #endif - perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS, + perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0); } p->se.vruntime -= old_cfsrq->min_vruntime - @@ -2724,7 +2724,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); - perf_counter_task_sched_in(current, cpu_of(rq)); + perf_event_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); @@ -5199,7 +5199,7 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); - perf_counter_task_tick(curr, cpu); + perf_event_task_tick(curr, cpu); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); @@ -5415,7 +5415,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, next, cpu); + perf_event_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; @@ -7692,7 +7692,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* * Register at high priority so that task migration (migrate_all_tasks) * happens before everything else. This has to be lower priority than - * the notifier in the perf_counter subsystem, though. + * the notifier in the perf_event subsystem, though. */ static struct notifier_block __cpuinitdata migration_notifier = { .notifier_call = migration_call, @@ -9549,7 +9549,7 @@ void __init sched_init(void) alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); #endif /* SMP */ - perf_counter_init(); + perf_event_init(); scheduler_running = 1; } diff --git a/kernel/sys.c b/kernel/sys.c index b3f1097c76fa..ea5c3bcac881 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -1511,11 +1511,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; - case PR_TASK_PERF_COUNTERS_DISABLE: - error = perf_counter_task_disable(); + case PR_TASK_PERF_EVENTS_DISABLE: + error = perf_event_task_disable(); break; - case PR_TASK_PERF_COUNTERS_ENABLE: - error = perf_counter_task_enable(); + case PR_TASK_PERF_EVENTS_ENABLE: + error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 68320f6b07b5..515bc230ac2a 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -177,4 +177,4 @@ cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); /* performance counters: */ -cond_syscall(sys_perf_counter_open); +cond_syscall(sys_perf_event_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 1a631ba684a4..6ba49c7cb128 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include @@ -964,28 +964,28 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif -#ifdef CONFIG_PERF_COUNTERS +#ifdef CONFIG_PERF_EVENTS { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_paranoid", - .data = &sysctl_perf_counter_paranoid, - .maxlen = sizeof(sysctl_perf_counter_paranoid), + .procname = "perf_event_paranoid", + .data = &sysctl_perf_event_paranoid, + .maxlen = sizeof(sysctl_perf_event_paranoid), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_mlock_kb", - .data = &sysctl_perf_counter_mlock, - .maxlen = sizeof(sysctl_perf_counter_mlock), + .procname = "perf_event_mlock_kb", + .data = &sysctl_perf_event_mlock, + .maxlen = sizeof(sysctl_perf_event_mlock), .mode = 0644, .proc_handler = &proc_dointvec, }, { .ctl_name = CTL_UNNUMBERED, - .procname = "perf_counter_max_sample_rate", - .data = &sysctl_perf_counter_sample_rate, - .maxlen = sizeof(sysctl_perf_counter_sample_rate), + .procname = "perf_event_max_sample_rate", + .data = &sysctl_perf_event_sample_rate, + .maxlen = sizeof(sysctl_perf_event_sample_rate), .mode = 0644, .proc_handler = &proc_dointvec, }, diff --git a/kernel/timer.c b/kernel/timer.c index bbb51074680e..811e5c391456 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include @@ -1187,7 +1187,7 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); - perf_counter_do_pending(); + perf_event_do_pending(); hrtimer_run_pending(); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 8712ce3c6a0e..233f3483ac83 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include "trace_output.h" @@ -414,7 +414,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_tpcounter_event(sys_data->enter_id, 0, 1, rec, size); + perf_tp_event(sys_data->enter_id, 0, 1, rec, size); } while(0); } @@ -476,7 +476,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec.nr = syscall_nr; rec.ret = syscall_get_return_value(current, regs); - perf_tpcounter_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); + perf_tp_event(sys_data->exit_id, 0, 1, &rec, sizeof(rec)); } int reg_prof_syscall_exit(char *name) diff --git a/mm/mmap.c b/mm/mmap.c index 26892e346d8f..376492ed08f4 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -1220,7 +1220,7 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - perf_counter_mmap(vma); + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); @@ -2308,7 +2308,7 @@ int install_special_mapping(struct mm_struct *mm, mm->total_vm += len >> PAGE_SHIFT; - perf_counter_mmap(vma); + perf_event_mmap(vma); return 0; } diff --git a/mm/mprotect.c b/mm/mprotect.c index d80311baeb2d..8bc969d8112d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include #include @@ -300,7 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_counter_mmap(vma); + perf_event_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 0aba8b6e9c54..b5f1953b6144 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -318,7 +318,7 @@ export PERL_PATH LIB_FILE=libperf.a -LIB_H += ../../include/linux/perf_counter.h +LIB_H += ../../include/linux/perf_event.h LIB_H += ../../include/linux/rbtree.h LIB_H += ../../include/linux/list.h LIB_H += util/include/linux/list.h diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 043d85b7e254..1ec741615814 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -505,7 +505,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - if (event->header.misc & PERF_EVENT_MISC_KERNEL) { + if (event->header.misc & PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -513,7 +513,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (event->header.misc & PERF_EVENT_MISC_USER) { + } else if (event->header.misc & PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -565,7 +565,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->mmap.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", + dump_printf("%p [%p]: PERF_RECORD_MMAP %d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, @@ -575,7 +575,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) event->mmap.filename); if (thread == NULL || map == NULL) { - dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n"); return 0; } @@ -591,14 +591,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) struct thread *thread; thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -614,7 +614,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->fork.pid, &threads, &last_match); parent = threads__findnew(event->fork.ppid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_FORK: %d:%d\n", + dump_printf("%p [%p]: PERF_RECORD_FORK: %d:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->fork.pid, event->fork.ppid); @@ -627,7 +627,7 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; if (!thread || !parent || thread__fork(thread, parent)) { - dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); return -1; } total_fork++; @@ -639,23 +639,23 @@ static int process_event(event_t *event, unsigned long offset, unsigned long head) { switch (event->header.type) { - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return process_mmap_event(event, offset, head); - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_FORK: + case PERF_RECORD_FORK: return process_fork_event(event, offset, head); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 2459e5a22ed8..a5a050af8e7d 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -77,7 +77,7 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static unsigned long mmap_read_head(struct mmap_data *md) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; long head; head = pc->data_head; @@ -88,7 +88,7 @@ static unsigned long mmap_read_head(struct mmap_data *md) static void mmap_write_tail(struct mmap_data *md, unsigned long tail) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; /* * ensure all reads are done before we write the tail out. @@ -233,7 +233,7 @@ static pid_t pid_synthesize_comm_event(pid_t pid, int full) } } - comm_ev.header.type = PERF_EVENT_COMM; + comm_ev.header.type = PERF_RECORD_COMM; size = ALIGN(size, sizeof(u64)); comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); @@ -288,7 +288,7 @@ static void pid_synthesize_mmap_samples(pid_t pid, pid_t tgid) while (1) { char bf[BUFSIZ], *pbf = bf; struct mmap_event mmap_ev = { - .header = { .type = PERF_EVENT_MMAP }, + .header = { .type = PERF_RECORD_MMAP }, }; int n; size_t size; @@ -355,7 +355,7 @@ static void synthesize_all(void) static int group_fd; -static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int nr) +static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr) { struct perf_header_attr *h_attr; @@ -371,7 +371,7 @@ static struct perf_header_attr *get_header_attr(struct perf_counter_attr *a, int static void create_counter(int counter, int cpu, pid_t pid) { - struct perf_counter_attr *attr = attrs + counter; + struct perf_event_attr *attr = attrs + counter; struct perf_header_attr *h_attr; int track = !counter; /* only the first counter needs these */ struct { @@ -417,7 +417,7 @@ static void create_counter(int counter, int cpu, pid_t pid) attr->disabled = 1; try_again: - fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); + fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0); if (fd[nr_cpu][counter] < 0) { int err = errno; @@ -444,7 +444,7 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[nr_cpu][counter], strerror(err)); - die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } @@ -478,7 +478,7 @@ try_again: if (multiplex && fd[nr_cpu][counter] != multiplex_fd) { int ret; - ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd); + ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd); assert(ret != -1); } else { event_array[nr_poll].fd = fd[nr_cpu][counter]; @@ -496,7 +496,7 @@ try_again: } } - ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); + ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE); } static void open_counters(int cpu, pid_t pid) @@ -642,7 +642,7 @@ static int __cmd_record(int argc, const char **argv) if (done) { for (i = 0; i < nr_cpu; i++) { for (counter = 0; counter < nr_counters; counter++) - ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE); + ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE); } } } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index cdf9a8d27bb9..19669c20088e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -1121,7 +1121,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1158,9 +1158,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) if (comm_list && !strlist__has_entry(comm_list, thread->comm)) return 0; - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -1168,7 +1168,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -1210,7 +1210,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->mmap.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_MMAP %d/%d: [%p(%p) @ %p]: %s\n", + dump_printf("%p [%p]: PERF_RECORD_MMAP %d/%d: [%p(%p) @ %p]: %s\n", (void *)(offset + head), (void *)(long)(event->header.size), event->mmap.pid, @@ -1221,7 +1221,7 @@ process_mmap_event(event_t *event, unsigned long offset, unsigned long head) event->mmap.filename); if (thread == NULL || map == NULL) { - dump_printf("problem processing PERF_EVENT_MMAP, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_MMAP, skipping event.\n"); return 0; } @@ -1238,14 +1238,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm_adjust(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -1262,10 +1262,10 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->fork.pid, &threads, &last_match); parent = threads__findnew(event->fork.ppid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_%s: (%d:%d):(%d:%d)\n", + dump_printf("%p [%p]: PERF_RECORD_%s: (%d:%d):(%d:%d)\n", (void *)(offset + head), (void *)(long)(event->header.size), - event->header.type == PERF_EVENT_FORK ? "FORK" : "EXIT", + event->header.type == PERF_RECORD_FORK ? "FORK" : "EXIT", event->fork.pid, event->fork.tid, event->fork.ppid, event->fork.ptid); @@ -1276,11 +1276,11 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) if (thread == parent) return 0; - if (event->header.type == PERF_EVENT_EXIT) + if (event->header.type == PERF_RECORD_EXIT) return 0; if (!thread || !parent || thread__fork(thread, parent)) { - dump_printf("problem processing PERF_EVENT_FORK, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_FORK, skipping event.\n"); return -1; } total_fork++; @@ -1291,7 +1291,7 @@ process_task_event(event_t *event, unsigned long offset, unsigned long head) static int process_lost_event(event_t *event, unsigned long offset, unsigned long head) { - dump_printf("%p [%p]: PERF_EVENT_LOST: id:%Ld: lost:%Ld\n", + dump_printf("%p [%p]: PERF_RECORD_LOST: id:%Ld: lost:%Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->lost.id, @@ -1305,7 +1305,7 @@ process_lost_event(event_t *event, unsigned long offset, unsigned long head) static int process_read_event(event_t *event, unsigned long offset, unsigned long head) { - struct perf_counter_attr *attr; + struct perf_event_attr *attr; attr = perf_header__find_attr(event->read.id, header); @@ -1319,7 +1319,7 @@ process_read_event(event_t *event, unsigned long offset, unsigned long head) event->read.value); } - dump_printf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n", + dump_printf("%p [%p]: PERF_RECORD_READ: %d %d %s %Lu\n", (void *)(offset + head), (void *)(long)(event->header.size), event->read.pid, @@ -1337,31 +1337,31 @@ process_event(event_t *event, unsigned long offset, unsigned long head) trace_event(event); switch (event->header.type) { - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return process_mmap_event(event, offset, head); - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_FORK: - case PERF_EVENT_EXIT: + case PERF_RECORD_FORK: + case PERF_RECORD_EXIT: return process_task_event(event, offset, head); - case PERF_EVENT_LOST: + case PERF_RECORD_LOST: return process_lost_event(event, offset, head); - case PERF_EVENT_READ: + case PERF_RECORD_READ: return process_read_event(event, offset, head); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 275d79c6627a..ea9c15c0cdfe 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1573,7 +1573,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -1589,9 +1589,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -1599,7 +1599,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -1626,23 +1626,23 @@ process_event(event_t *event, unsigned long offset, unsigned long head) nr_events++; switch (event->header.type) { - case PERF_EVENT_MMAP: + case PERF_RECORD_MMAP: return 0; - case PERF_EVENT_LOST: + case PERF_RECORD_LOST: nr_lost_chunks++; nr_lost_events += event->lost.lost; return 0; - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_EXIT ... PERF_EVENT_READ: + case PERF_RECORD_EXIT ... PERF_RECORD_READ: return 0; - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MAX: + case PERF_RECORD_MAX: default: return -1; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 61b828236c11..16af2d82e858 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -48,7 +48,7 @@ #include #include -static struct perf_counter_attr default_attrs[] = { +static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, @@ -130,11 +130,11 @@ struct stats runtime_cycles_stats; attrs[counter].config == PERF_COUNT_##c) #define ERR_PERF_OPEN \ -"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n" +"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n" static void create_perf_stat_counter(int counter, int pid) { - struct perf_counter_attr *attr = attrs + counter; + struct perf_event_attr *attr = attrs + counter; if (scale) attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | @@ -144,7 +144,7 @@ static void create_perf_stat_counter(int counter, int pid) unsigned int cpu; for (cpu = 0; cpu < nr_cpus; cpu++) { - fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); + fd[cpu][counter] = sys_perf_event_open(attr, -1, cpu, -1, 0); if (fd[cpu][counter] < 0 && verbose) fprintf(stderr, ERR_PERF_OPEN, counter, fd[cpu][counter], strerror(errno)); @@ -154,7 +154,7 @@ static void create_perf_stat_counter(int counter, int pid) attr->disabled = 1; attr->enable_on_exec = 1; - fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0); + fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0); if (fd[0][counter] < 0 && verbose) fprintf(stderr, ERR_PERF_OPEN, counter, fd[0][counter], strerror(errno)); diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index 600406396274..4405681b3134 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -937,21 +937,21 @@ process_event(event_t *event) switch (event->header.type) { - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event); - case PERF_EVENT_FORK: + case PERF_RECORD_FORK: return process_fork_event(event); - case PERF_EVENT_EXIT: + case PERF_RECORD_EXIT: return process_exit_event(event); - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return queue_sample_event(event); /* * We dont process them right now but they are fine: */ - case PERF_EVENT_MMAP: - case PERF_EVENT_THROTTLE: - case PERF_EVENT_UNTHROTTLE: + case PERF_RECORD_MMAP: + case PERF_RECORD_THROTTLE: + case PERF_RECORD_UNTHROTTLE: return 0; default: diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4002ccb36750..1ca88896eee4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -901,7 +901,7 @@ struct mmap_data { static unsigned int mmap_read_head(struct mmap_data *md) { - struct perf_counter_mmap_page *pc = md->base; + struct perf_event_mmap_page *pc = md->base; int head; head = pc->data_head; @@ -977,9 +977,9 @@ static void mmap_read_counter(struct mmap_data *md) old += size; - if (event->header.type == PERF_EVENT_SAMPLE) { + if (event->header.type == PERF_RECORD_SAMPLE) { int user = - (event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK) == PERF_EVENT_MISC_USER; + (event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK) == PERF_RECORD_MISC_USER; process_event(event->ip.ip, md->counter, user); } } @@ -1005,7 +1005,7 @@ int group_fd; static void start_counter(int i, int counter) { - struct perf_counter_attr *attr; + struct perf_event_attr *attr; int cpu; cpu = profile_cpu; @@ -1019,7 +1019,7 @@ static void start_counter(int i, int counter) attr->inherit = (cpu < 0) && inherit; try_again: - fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); + fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0); if (fd[i][counter] < 0) { int err = errno; @@ -1044,7 +1044,7 @@ try_again: printf("\n"); error("perfcounter syscall returned with %d (%s)\n", fd[i][counter], strerror(err)); - die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); + die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); exit(-1); } assert(fd[i][counter] >= 0); diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index 914ab366e369..e9d256e2f47d 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -35,14 +35,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head) thread = threads__findnew(event->comm.pid, &threads, &last_match); - dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", + dump_printf("%p [%p]: PERF_RECORD_COMM: %s:%d\n", (void *)(offset + head), (void *)(long)(event->header.size), event->comm.comm, event->comm.pid); if (thread == NULL || thread__set_comm(thread, event->comm.comm)) { - dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n"); + dump_printf("problem processing PERF_RECORD_COMM, skipping event.\n"); return -1; } total_comm++; @@ -82,7 +82,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) more_data += sizeof(u64); } - dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", + dump_printf("%p [%p]: PERF_RECORD_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n", (void *)(offset + head), (void *)(long)(event->header.size), event->header.misc, @@ -98,9 +98,9 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) return -1; } - cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK; + cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; - if (cpumode == PERF_EVENT_MISC_KERNEL) { + if (cpumode == PERF_RECORD_MISC_KERNEL) { show = SHOW_KERNEL; level = 'k'; @@ -108,7 +108,7 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head) dump_printf(" ...... dso: %s\n", dso->name); - } else if (cpumode == PERF_EVENT_MISC_USER) { + } else if (cpumode == PERF_RECORD_MISC_USER) { show = SHOW_USER; level = '.'; @@ -146,19 +146,19 @@ process_event(event_t *event, unsigned long offset, unsigned long head) trace_event(event); switch (event->header.type) { - case PERF_EVENT_MMAP ... PERF_EVENT_LOST: + case PERF_RECORD_MMAP ... PERF_RECORD_LOST: return 0; - case PERF_EVENT_COMM: + case PERF_RECORD_COMM: return process_comm_event(event, offset, head); - case PERF_EVENT_EXIT ... PERF_EVENT_READ: + case PERF_RECORD_EXIT ... PERF_RECORD_READ: return 0; - case PERF_EVENT_SAMPLE: + case PERF_RECORD_SAMPLE: return process_sample_event(event, offset, head); - case PERF_EVENT_MAX: + case PERF_RECORD_MAX: default: return -1; } diff --git a/tools/perf/design.txt b/tools/perf/design.txt index f71e0d245cba..f1946d107b10 100644 --- a/tools/perf/design.txt +++ b/tools/perf/design.txt @@ -18,10 +18,10 @@ underlying hardware counters. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. -The special file descriptor is opened via the perf_counter_open() +The special file descriptor is opened via the perf_event_open() system call: - int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, + int sys_perf_event_open(struct perf_event_hw_event *hw_event_uptr, pid_t pid, int cpu, int group_fd, unsigned long flags); @@ -32,9 +32,9 @@ can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. -When creating a new counter fd, 'perf_counter_hw_event' is: +When creating a new counter fd, 'perf_event_hw_event' is: -struct perf_counter_hw_event { +struct perf_event_hw_event { /* * The MSB of the config word signifies if the rest contains cpu * specific (raw) counter configuration data, if unset, the next @@ -93,7 +93,7 @@ specified by 'event_id': /* * Generalized performance counter event types, used by the hw_event.event_id - * parameter of the sys_perf_counter_open() syscall: + * parameter of the sys_perf_event_open() syscall: */ enum hw_event_ids { /* @@ -159,7 +159,7 @@ in size. * reads on the counter should return the indicated quantities, * in increasing order of bit value, after the counter value. */ -enum perf_counter_read_format { +enum perf_event_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1, PERF_FORMAT_TOTAL_TIME_RUNNING = 2, }; @@ -178,7 +178,7 @@ interrupt: * Bits that can be set in hw_event.record_type to request information * in the overflow packets. */ -enum perf_counter_record_format { +enum perf_event_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, @@ -228,7 +228,7 @@ these events are recorded in the ring-buffer (see below). The 'comm' bit allows tracking of process comm data on process creation. This too is recorded in the ring-buffer (see below). -The 'pid' parameter to the perf_counter_open() system call allows the +The 'pid' parameter to the perf_event_open() system call allows the counter to be specific to a task: pid == 0: if the pid parameter is zero, the counter is attached to the @@ -258,7 +258,7 @@ The 'flags' parameter is currently unused and must be zero. The 'group_fd' parameter allows counter "groups" to be set up. A counter group has one counter which is the group "leader". The leader -is created first, with group_fd = -1 in the perf_counter_open call +is created first, with group_fd = -1 in the perf_event_open call that creates it. The rest of the group members are created subsequently, with group_fd giving the fd of the group leader. (A single counter on its own is created with group_fd = -1 and is @@ -277,13 +277,13 @@ tracking are logged into a ring-buffer. This ring-buffer is created and accessed through mmap(). The mmap size should be 1+2^n pages, where the first page is a meta-data page -(struct perf_counter_mmap_page) that contains various bits of information such +(struct perf_event_mmap_page) that contains various bits of information such as where the ring-buffer head is. /* * Structure of the page that can be mapped via mmap */ -struct perf_counter_mmap_page { +struct perf_event_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ @@ -317,7 +317,7 @@ struct perf_counter_mmap_page { * Control data for the mmap() data buffer. * * User-space reading this value should issue an rmb(), on SMP capable - * platforms, after reading this value -- see perf_counter_wakeup(). + * platforms, after reading this value -- see perf_event_wakeup(). */ __u32 data_head; /* head in the data section */ }; @@ -327,9 +327,9 @@ NOTE: the hw-counter userspace bits are arch specific and are currently only The following 2^n pages are the ring-buffer which contains events of the form: -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) -#define PERF_EVENT_MISC_OVERFLOW (1 << 2) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (1 << 1) +#define PERF_RECORD_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -353,8 +353,8 @@ enum perf_event_type { * char filename[]; * }; */ - PERF_EVENT_MMAP = 1, - PERF_EVENT_MUNMAP = 2, + PERF_RECORD_MMAP = 1, + PERF_RECORD_MUNMAP = 2, /* * struct { @@ -364,10 +364,10 @@ enum perf_event_type { * char comm[]; * }; */ - PERF_EVENT_COMM = 3, + PERF_RECORD_COMM = 3, /* - * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * When header.misc & PERF_RECORD_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* * * struct { @@ -397,7 +397,7 @@ Notification of new events is possible through poll()/select()/epoll() and fcntl() managing signals. Normally a notification is generated for every page filled, however one can -additionally set perf_counter_hw_event.wakeup_events to generate one every +additionally set perf_event_hw_event.wakeup_events to generate one every so many counter overflow events. Future work will include a splice() interface to the ring-buffer. @@ -409,11 +409,11 @@ events but does continue to exist and maintain its count value. An individual counter or counter group can be enabled with - ioctl(fd, PERF_COUNTER_IOC_ENABLE); + ioctl(fd, PERF_EVENT_IOC_ENABLE); or disabled with - ioctl(fd, PERF_COUNTER_IOC_DISABLE); + ioctl(fd, PERF_EVENT_IOC_DISABLE); Enabling or disabling the leader of a group enables or disables the whole group; that is, while the group leader is disabled, none of the @@ -424,16 +424,16 @@ other counter. Additionally, non-inherited overflow counters can use - ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr); + ioctl(fd, PERF_EVENT_IOC_REFRESH, nr); to enable a counter for 'nr' events, after which it gets disabled again. A process can enable or disable all the counter groups that are attached to it, using prctl: - prctl(PR_TASK_PERF_COUNTERS_ENABLE); + prctl(PR_TASK_PERF_EVENTS_ENABLE); - prctl(PR_TASK_PERF_COUNTERS_DISABLE); + prctl(PR_TASK_PERF_EVENTS_DISABLE); This applies to all counters on the current process, whether created by this process or by another, and doesn't affect any counters that @@ -447,11 +447,11 @@ Arch requirements If your architecture does not have hardware performance metrics, you can still use the generic software counters based on hrtimers for sampling. -So to start with, in order to add HAVE_PERF_COUNTERS to your Kconfig, you +So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you will need at least this: - - asm/perf_counter.h - a basic stub will suffice at first + - asm/perf_event.h - a basic stub will suffice at first - support for atomic64 types (and associated helper functions) - - set_perf_counter_pending() implemented + - set_perf_event_pending() implemented If your architecture does have hardware capabilities, you can override the -weak stub hw_perf_counter_init() to register hardware counters. +weak stub hw_perf_event_init() to register hardware counters. diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 2abeb20d0bf3..8cc4623afd6f 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -52,15 +52,15 @@ #include #include -#include "../../include/linux/perf_counter.h" +#include "../../include/linux/perf_event.h" #include "util/types.h" /* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all * counters in the current task. */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 +#define PR_TASK_PERF_EVENTS_DISABLE 31 +#define PR_TASK_PERF_EVENTS_ENABLE 32 #ifndef NSEC_PER_SEC # define NSEC_PER_SEC 1000000000ULL @@ -90,12 +90,12 @@ static inline unsigned long long rdclock(void) _min1 < _min2 ? _min1 : _min2; }) static inline int -sys_perf_counter_open(struct perf_counter_attr *attr, +sys_perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { attr->size = sizeof(*attr); - return syscall(__NR_perf_counter_open, attr, pid, cpu, + return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index 018d414a09d1..2c9c26d6ded0 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -1,5 +1,5 @@ -#ifndef __PERF_EVENT_H -#define __PERF_EVENT_H +#ifndef __PERF_RECORD_H +#define __PERF_RECORD_H #include "../perf.h" #include "util.h" #include diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index bb4fca3efcc3..e306857b2c2b 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -9,7 +9,7 @@ /* * Create new perf.data header attribute: */ -struct perf_header_attr *perf_header_attr__new(struct perf_counter_attr *attr) +struct perf_header_attr *perf_header_attr__new(struct perf_event_attr *attr) { struct perf_header_attr *self = malloc(sizeof(*self)); @@ -134,7 +134,7 @@ struct perf_file_section { }; struct perf_file_attr { - struct perf_counter_attr attr; + struct perf_event_attr attr; struct perf_file_section ids; }; @@ -320,7 +320,7 @@ u64 perf_header__sample_type(struct perf_header *header) return type; } -struct perf_counter_attr * +struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header) { int i; diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h index 7b0e84a87179..a0761bc7863c 100644 --- a/tools/perf/util/header.h +++ b/tools/perf/util/header.h @@ -1,12 +1,12 @@ #ifndef _PERF_HEADER_H #define _PERF_HEADER_H -#include "../../../include/linux/perf_counter.h" +#include "../../../include/linux/perf_event.h" #include #include "types.h" struct perf_header_attr { - struct perf_counter_attr attr; + struct perf_event_attr attr; int ids, size; u64 *id; off_t id_offset; @@ -34,11 +34,11 @@ char *perf_header__find_event(u64 id); struct perf_header_attr * -perf_header_attr__new(struct perf_counter_attr *attr); +perf_header_attr__new(struct perf_event_attr *attr); void perf_header_attr__add_id(struct perf_header_attr *self, u64 id); u64 perf_header__sample_type(struct perf_header *header); -struct perf_counter_attr * +struct perf_event_attr * perf_header__find_attr(u64 id, struct perf_header *header); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 89172fd0038b..13ab4b842d49 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -10,7 +10,7 @@ int nr_counters; -struct perf_counter_attr attrs[MAX_COUNTERS]; +struct perf_event_attr attrs[MAX_COUNTERS]; struct event_symbol { u8 type; @@ -48,13 +48,13 @@ static struct event_symbol event_symbols[] = { { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, }; -#define __PERF_COUNTER_FIELD(config, name) \ - ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) +#define __PERF_EVENT_FIELD(config, name) \ + ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT) -#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) -#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) -#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) -#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) +#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) +#define PERF_EVENT_CONFIG(config) __PERF_EVENT_FIELD(config, CONFIG) +#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) +#define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) static const char *hw_event_names[] = { "cycles", @@ -352,7 +352,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int } static enum event_result -parse_generic_hw_event(const char **str, struct perf_counter_attr *attr) +parse_generic_hw_event(const char **str, struct perf_event_attr *attr) { const char *s = *str; int cache_type = -1, cache_op = -1, cache_result = -1; @@ -417,7 +417,7 @@ parse_single_tracepoint_event(char *sys_name, const char *evt_name, unsigned int evt_length, char *flags, - struct perf_counter_attr *attr, + struct perf_event_attr *attr, const char **strp) { char evt_path[MAXPATHLEN]; @@ -505,7 +505,7 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) static enum event_result parse_tracepoint_event(const char **strp, - struct perf_counter_attr *attr) + struct perf_event_attr *attr) { const char *evt_name; char *flags; @@ -563,7 +563,7 @@ static int check_events(const char *str, unsigned int i) } static enum event_result -parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) +parse_symbolic_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; unsigned int i; @@ -582,7 +582,7 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_raw_event(const char **strp, struct perf_counter_attr *attr) +parse_raw_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; u64 config; @@ -601,7 +601,7 @@ parse_raw_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_numeric_event(const char **strp, struct perf_counter_attr *attr) +parse_numeric_event(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; char *endp; @@ -623,7 +623,7 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr) } static enum event_result -parse_event_modifier(const char **strp, struct perf_counter_attr *attr) +parse_event_modifier(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; int eu = 1, ek = 1, eh = 1; @@ -656,7 +656,7 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr) * Symbolic names are (almost) exactly matched. */ static enum event_result -parse_event_symbols(const char **str, struct perf_counter_attr *attr) +parse_event_symbols(const char **str, struct perf_event_attr *attr) { enum event_result ret; @@ -711,7 +711,7 @@ static void store_event_type(const char *orgname) int parse_events(const struct option *opt __used, const char *str, int unset __used) { - struct perf_counter_attr attr; + struct perf_event_attr attr; enum event_result ret; if (strchr(str, ':')) diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 60704c15961f..30c608112845 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -16,7 +16,7 @@ extern struct tracepoint_path *tracepoint_id_to_path(u64 config); extern int nr_counters; -extern struct perf_counter_attr attrs[MAX_COUNTERS]; +extern struct perf_event_attr attrs[MAX_COUNTERS]; extern const char *event_name(int ctr); extern const char *__event_name(int type, u64 config); diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c index 1fd824c1f1c4..af4b0573b37f 100644 --- a/tools/perf/util/trace-event-info.c +++ b/tools/perf/util/trace-event-info.c @@ -480,12 +480,12 @@ out: } static struct tracepoint_path * -get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters) +get_tracepoints_path(struct perf_event_attr *pattrs, int nb_events) { struct tracepoint_path path, *ppath = &path; int i; - for (i = 0; i < nb_counters; i++) { + for (i = 0; i < nb_events; i++) { if (pattrs[i].type != PERF_TYPE_TRACEPOINT) continue; ppath->next = tracepoint_id_to_path(pattrs[i].config); @@ -496,7 +496,7 @@ get_tracepoints_path(struct perf_counter_attr *pattrs, int nb_counters) return path.next; } -void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters) +void read_tracing_data(struct perf_event_attr *pattrs, int nb_events) { char buf[BUFSIZ]; struct tracepoint_path *tps; @@ -530,7 +530,7 @@ void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters) page_size = getpagesize(); write_or_die(&page_size, 4); - tps = get_tracepoints_path(pattrs, nb_counters); + tps = get_tracepoints_path(pattrs, nb_events); read_header_files(); read_ftrace_files(tps); diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h index d35ebf1e29ff..693f815c9429 100644 --- a/tools/perf/util/trace-event.h +++ b/tools/perf/util/trace-event.h @@ -240,6 +240,6 @@ unsigned long long raw_field_value(struct event *event, const char *name, void *data); void *raw_field_ptr(struct event *event, const char *name, void *data); -void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters); +void read_tracing_data(struct perf_event_attr *pattrs, int nb_events); #endif /* _TRACE_EVENTS_H */ -- cgit v1.2.3 From 2003b7af259611312ea132da1f5006ae0b8e47d7 Mon Sep 17 00:00:00 2001 From: Frederic Riss Date: Mon, 21 Sep 2009 08:43:30 +0100 Subject: ARM: 5715/1: Make kprobes unregistration SMP safe ARM kprobes use an illegal instruction to trigger kprobes. In the current implementation, there's a race between the unregistration of a kprobe and the illegal instruction exception handler if they run at the same time on different cores. When reading the value of the undefined instruction, the exception handler might get the original legal instruction as just patched concurrently by arch_disarm_kprobe(). When this happen the kprobe handler won't run, and thus the exception handler will oops because it believe it just hit an undefined instruction in kernel space. The following patch synchronizes the code patching in the kprobes unregistration using stop_machine and thus avoids the above race. Signed-off-by: Frederic RISS Acked-by: Nicolas Pitre Signed-off-by: Russell King --- arch/arm/kernel/kprobes.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/kprobes.c b/arch/arm/kernel/kprobes.c index f692efddd449..60c62c377fa9 100644 --- a/arch/arm/kernel/kprobes.c +++ b/arch/arm/kernel/kprobes.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -83,10 +84,24 @@ void __kprobes arch_arm_kprobe(struct kprobe *p) flush_insns(p->addr, 1); } +/* + * The actual disarming is done here on each CPU and synchronized using + * stop_machine. This synchronization is necessary on SMP to avoid removing + * a probe between the moment the 'Undefined Instruction' exception is raised + * and the moment the exception handler reads the faulting instruction from + * memory. + */ +int __kprobes __arch_disarm_kprobe(void *p) +{ + struct kprobe *kp = p; + *kp->addr = kp->opcode; + flush_insns(kp->addr, 1); + return 0; +} + void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_insns(p->addr, 1); + stop_machine(__arch_disarm_kprobe, p, &cpu_online_map); } void __kprobes arch_remove_kprobe(struct kprobe *p) -- cgit v1.2.3 From 56f8ba83a52b9f9e3711eff8e54168ac14aa288f Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 24 Sep 2009 09:34:49 -0600 Subject: cpumask: use mm_cpumask() wrapper: arm Makes code futureproof against the impending change to mm->cpu_vm_mask. It's also a chance to use the new cpumask_ ops which take a pointer (the older ones are deprecated, but there's no hurry for arch code). Signed-off-by: Rusty Russell --- arch/arm/include/asm/cacheflush.h | 8 ++++---- arch/arm/include/asm/mmu_context.h | 7 ++++--- arch/arm/include/asm/tlbflush.h | 4 ++-- arch/arm/kernel/smp.c | 10 +++++----- arch/arm/mm/context.c | 2 +- arch/arm/mm/flush.c | 10 +++++----- 6 files changed, 21 insertions(+), 20 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 1a711ea8418b..fd03fb63a332 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -334,14 +334,14 @@ static inline void outer_flush_range(unsigned long start, unsigned long end) #ifndef CONFIG_CPU_CACHE_VIPT static inline void flush_cache_mm(struct mm_struct *mm) { - if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) __cpuc_flush_user_all(); } static inline void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) __cpuc_flush_user_range(start & PAGE_MASK, PAGE_ALIGN(end), vma->vm_flags); } @@ -349,7 +349,7 @@ flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long static inline void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) { - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) { + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { unsigned long addr = user_addr & PAGE_MASK; __cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags); } @@ -360,7 +360,7 @@ flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long uaddr, void *kaddr, unsigned long len, int write) { - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) { + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { unsigned long addr = (unsigned long)kaddr; __cpuc_coherent_kern_range(addr, addr + len); } diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index bcdb9291ef0c..de6cefb329dd 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -103,14 +103,15 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, #ifdef CONFIG_SMP /* check for possible thread migration */ - if (!cpus_empty(next->cpu_vm_mask) && !cpu_isset(cpu, next->cpu_vm_mask)) + if (!cpumask_empty(mm_cpumask(next)) && + !cpumask_test_cpu(cpu, mm_cpumask(next))) __flush_icache_all(); #endif - if (!cpu_test_and_set(cpu, next->cpu_vm_mask) || prev != next) { + if (!cpumask_test_and_set_cpu(cpu, mm_cpumask(next)) || prev != next) { check_context(next); cpu_switch_mm(next->pgd, next); if (cache_is_vivt()) - cpu_clear(cpu, prev->cpu_vm_mask); + cpumask_clear_cpu(cpu, mm_cpumask(prev)); } #endif } diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index c964f3fc3bc5..a45ab5dd8255 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -350,7 +350,7 @@ static inline void local_flush_tlb_mm(struct mm_struct *mm) if (tlb_flag(TLB_WB)) dsb(); - if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) { + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) { if (tlb_flag(TLB_V3_FULL)) asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (zero) : "cc"); if (tlb_flag(TLB_V4_U_FULL)) @@ -388,7 +388,7 @@ local_flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) if (tlb_flag(TLB_WB)) dsb(); - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) { + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { if (tlb_flag(TLB_V3_PAGE)) asm("mcr p15, 0, %0, c6, c0, 0" : : "r" (uaddr) : "cc"); if (tlb_flag(TLB_V4_U_PAGE)) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index de885fd256c5..e0d32770bb3d 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -189,7 +189,7 @@ int __cpuexit __cpu_disable(void) read_lock(&tasklist_lock); for_each_process(p) { if (p->mm) - cpu_clear(cpu, p->mm->cpu_vm_mask); + cpumask_clear_cpu(cpu, mm_cpumask(p->mm)); } read_unlock(&tasklist_lock); @@ -257,7 +257,7 @@ asmlinkage void __cpuinit secondary_start_kernel(void) atomic_inc(&mm->mm_users); atomic_inc(&mm->mm_count); current->active_mm = mm; - cpu_set(cpu, mm->cpu_vm_mask); + cpumask_set_cpu(cpu, mm_cpumask(mm)); cpu_switch_mm(mm->pgd, mm); enter_lazy_tlb(mm, current); local_flush_tlb_all(); @@ -643,7 +643,7 @@ void flush_tlb_all(void) void flush_tlb_mm(struct mm_struct *mm) { if (tlb_ops_need_broadcast()) - on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, &mm->cpu_vm_mask); + on_each_cpu_mask(ipi_flush_tlb_mm, mm, 1, mm_cpumask(mm)); else local_flush_tlb_mm(mm); } @@ -654,7 +654,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr) struct tlb_args ta; ta.ta_vma = vma; ta.ta_start = uaddr; - on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, &vma->vm_mm->cpu_vm_mask); + on_each_cpu_mask(ipi_flush_tlb_page, &ta, 1, mm_cpumask(vma->vm_mm)); } else local_flush_tlb_page(vma, uaddr); } @@ -677,7 +677,7 @@ void flush_tlb_range(struct vm_area_struct *vma, ta.ta_vma = vma; ta.ta_start = start; ta.ta_end = end; - on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, &vma->vm_mm->cpu_vm_mask); + on_each_cpu_mask(ipi_flush_tlb_range, &ta, 1, mm_cpumask(vma->vm_mm)); } else local_flush_tlb_range(vma, start, end); } diff --git a/arch/arm/mm/context.c b/arch/arm/mm/context.c index fc84fcc74380..6bda76a43199 100644 --- a/arch/arm/mm/context.c +++ b/arch/arm/mm/context.c @@ -59,6 +59,6 @@ void __new_context(struct mm_struct *mm) } spin_unlock(&cpu_asid_lock); - mm->cpu_vm_mask = cpumask_of_cpu(smp_processor_id()); + cpumask_copy(mm_cpumask(mm), cpumask_of(smp_processor_id())); mm->context.id = asid; } diff --git a/arch/arm/mm/flush.c b/arch/arm/mm/flush.c index 575f3ad722e7..b27942909b23 100644 --- a/arch/arm/mm/flush.c +++ b/arch/arm/mm/flush.c @@ -50,7 +50,7 @@ static void flush_pfn_alias(unsigned long pfn, unsigned long vaddr) void flush_cache_mm(struct mm_struct *mm) { if (cache_is_vivt()) { - if (cpu_isset(smp_processor_id(), mm->cpu_vm_mask)) + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm))) __cpuc_flush_user_all(); return; } @@ -73,7 +73,7 @@ void flush_cache_mm(struct mm_struct *mm) void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end) { if (cache_is_vivt()) { - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) __cpuc_flush_user_range(start & PAGE_MASK, PAGE_ALIGN(end), vma->vm_flags); return; @@ -97,7 +97,7 @@ void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr, unsigned long pfn) { if (cache_is_vivt()) { - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) { + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { unsigned long addr = user_addr & PAGE_MASK; __cpuc_flush_user_range(addr, addr + PAGE_SIZE, vma->vm_flags); } @@ -113,7 +113,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, unsigned long len, int write) { if (cache_is_vivt()) { - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask)) { + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm))) { unsigned long addr = (unsigned long)kaddr; __cpuc_coherent_kern_range(addr, addr + len); } @@ -126,7 +126,7 @@ void flush_ptrace_access(struct vm_area_struct *vma, struct page *page, } /* VIPT non-aliasing cache */ - if (cpu_isset(smp_processor_id(), vma->vm_mm->cpu_vm_mask) && + if (cpumask_test_cpu(smp_processor_id(), mm_cpumask(vma->vm_mm)) && vma->vm_flags & VM_EXEC) { unsigned long addr = (unsigned long)kaddr; /* only flushing the kernel mapping on non-aliasing VIPT */ -- cgit v1.2.3 From 2bcd57ab61e7cabed626226a3771617981c11ce1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 24 Sep 2009 04:22:25 +0400 Subject: headers: utsname.h redux * remove asm/atomic.h inclusion from linux/utsname.h -- not needed after kref conversion * remove linux/utsname.h inclusion from files which do not need it NOTE: it looks like fs/binfmt_elf.c do not need utsname.h, however due to some personality stuff it _is_ needed -- cowardly leave ELF-related headers and files alone. Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- arch/alpha/kernel/process.c | 1 - arch/arm/kernel/sys_arm.c | 1 - arch/frv/kernel/sys_frv.c | 1 - arch/h8300/kernel/sys_h8300.c | 1 - arch/m68k/kernel/sys_m68k.c | 1 - arch/m68knommu/kernel/sys_m68k.c | 1 - arch/microblaze/kernel/sys_microblaze.c | 1 - arch/mn10300/kernel/sys_mn10300.c | 1 - arch/parisc/kernel/sys_parisc32.c | 1 - arch/powerpc/kernel/setup-common.c | 1 - arch/powerpc/kernel/sys_ppc32.c | 1 - arch/s390/kernel/compat_linux.c | 1 - arch/s390/kernel/process.c | 1 - arch/sh/kernel/sys_sh32.c | 1 - arch/sh/kernel/sys_sh64.c | 1 - arch/sparc/kernel/sys_sparc32.c | 1 - arch/sparc/kernel/systbls.h | 3 ++- arch/x86/kernel/dumpstack_32.c | 1 - arch/x86/kernel/dumpstack_64.c | 1 - arch/x86/kernel/traps.c | 1 - drivers/media/video/usbvision/usbvision-core.c | 1 - drivers/media/video/usbvision/usbvision-i2c.c | 1 - drivers/media/video/usbvision/usbvision-video.c | 1 - drivers/s390/char/zcore.c | 1 - drivers/usb/gadget/f_loopback.c | 1 - drivers/usb/gadget/f_obex.c | 1 - drivers/usb/gadget/f_sourcesink.c | 1 - drivers/usb/gadget/u_audio.c | 1 - drivers/usb/gadget/u_ether.c | 1 - fs/gfs2/ops_inode.c | 1 - fs/lockd/xdr.c | 1 - fs/lockd/xdr4.c | 1 - fs/nfs/nfs2xdr.c | 1 - fs/nfs/nfs3proc.c | 1 - fs/nfs/nfs3xdr.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/nfs4xdr.c | 1 - fs/nfs/proc.c | 1 - fs/nfsd/nfs4idmap.c | 1 - fs/ocfs2/dlm/dlmast.c | 1 - fs/ocfs2/dlm/dlmconvert.c | 1 - fs/ocfs2/dlm/dlmdebug.c | 1 - fs/ocfs2/dlm/dlmdomain.c | 1 - fs/ocfs2/dlm/dlmlock.c | 1 - fs/ocfs2/dlm/dlmmaster.c | 1 - fs/ocfs2/dlm/dlmrecovery.c | 1 - fs/ocfs2/dlm/dlmthread.c | 1 - fs/ocfs2/dlm/dlmunlock.c | 1 - fs/ocfs2/super.c | 1 - fs/ocfs2/symlink.c | 1 - include/linux/utsname.h | 1 - init/main.c | 1 - kernel/power/swap.c | 1 - kernel/sysctl.c | 1 - kernel/uid16.c | 1 - net/sunrpc/auth_null.c | 1 - 56 files changed, 2 insertions(+), 56 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 3a2fb7a02db4..289039bb6bb2 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index b3ec641b5cf8..78ecaac65206 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c index baadc97f8627..2b6b5289cdcc 100644 --- a/arch/frv/kernel/sys_frv.c +++ b/arch/frv/kernel/sys_frv.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c index 2745656dcc52..8cb5d73a0e35 100644 --- a/arch/h8300/kernel/sys_h8300.c +++ b/arch/h8300/kernel/sys_h8300.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 7f54efaf60bb..7deb402bfc75 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c index 700281638629..efdd090778a3 100644 --- a/arch/m68knommu/kernel/sys_m68k.c +++ b/arch/m68knommu/kernel/sys_m68k.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index b96f1682bb24..07cabed4b947 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index 3e52a1054327..8ca5af00334c 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/arch/parisc/kernel/sys_parisc32.c b/arch/parisc/kernel/sys_parisc32.c index 92a0acaa0d12..561388b17c91 100644 --- a/arch/parisc/kernel/sys_parisc32.c +++ b/arch/parisc/kernel/sys_parisc32.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 1d5570a1e456..74cd1a7d0d4b 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/sys_ppc32.c b/arch/powerpc/kernel/sys_ppc32.c index 1cc5e9e5da96..b97c2d67f4ac 100644 --- a/arch/powerpc/kernel/sys_ppc32.c +++ b/arch/powerpc/kernel/sys_ppc32.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 5519cb745106..0debcec23a39 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 59fe6ecc6ed3..5417eb57271a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/kernel/sys_sh32.c b/arch/sh/kernel/sys_sh32.c index 63ba12836eae..eb68bfdd86e6 100644 --- a/arch/sh/kernel/sys_sh32.c +++ b/arch/sh/kernel/sys_sh32.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/kernel/sys_sh64.c b/arch/sh/kernel/sys_sh64.c index 91fb8445a5a0..287235768bc5 100644 --- a/arch/sh/kernel/sys_sh64.c +++ b/arch/sh/kernel/sys_sh64.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/sys_sparc32.c b/arch/sparc/kernel/sys_sparc32.c index f5000a460c05..04e28b2671c8 100644 --- a/arch/sparc/kernel/sys_sparc32.c +++ b/arch/sparc/kernel/sys_sparc32.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h index 15c2d752b2bc..a63c5d2d9849 100644 --- a/arch/sparc/kernel/systbls.h +++ b/arch/sparc/kernel/systbls.h @@ -3,10 +3,11 @@ #include #include -#include #include #include +struct new_utsname; + extern asmlinkage unsigned long sys_getpagesize(void); extern asmlinkage unsigned long sparc_brk(unsigned long brk); extern asmlinkage long sparc_pipe(struct pt_regs *regs); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index bca5fba91c9e..f7dd2a7c3bf4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 54b0a3276766..a071e6be177e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9346e102338d..a665c71352b8 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-core.c b/drivers/media/video/usbvision/usbvision-core.c index 6ba16abeebdd..e0f91e4ab653 100644 --- a/drivers/media/video/usbvision/usbvision-core.c +++ b/drivers/media/video/usbvision/usbvision-core.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-i2c.c b/drivers/media/video/usbvision/usbvision-i2c.c index f97fd06d5948..c19f51dba2ee 100644 --- a/drivers/media/video/usbvision/usbvision-i2c.c +++ b/drivers/media/video/usbvision/usbvision-i2c.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c index 90d9b5c0e9a7..a2a50d608a3f 100644 --- a/drivers/media/video/usbvision/usbvision-video.c +++ b/drivers/media/video/usbvision/usbvision-video.c @@ -52,7 +52,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/s390/char/zcore.c b/drivers/s390/char/zcore.c index c431198bdbc4..82daa3c1dc9c 100644 --- a/drivers/s390/char/zcore.c +++ b/drivers/s390/char/zcore.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/f_loopback.c b/drivers/usb/gadget/f_loopback.c index eb6ddfc20857..6cb29d3df575 100644 --- a/drivers/usb/gadget/f_loopback.c +++ b/drivers/usb/gadget/f_loopback.c @@ -22,7 +22,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include "g_zero.h" diff --git a/drivers/usb/gadget/f_obex.c b/drivers/usb/gadget/f_obex.c index 46d6266f30ec..b4a3ba654ea5 100644 --- a/drivers/usb/gadget/f_obex.c +++ b/drivers/usb/gadget/f_obex.c @@ -24,7 +24,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include "u_serial.h" diff --git a/drivers/usb/gadget/f_sourcesink.c b/drivers/usb/gadget/f_sourcesink.c index bffe91d525f9..09cba273d2db 100644 --- a/drivers/usb/gadget/f_sourcesink.c +++ b/drivers/usb/gadget/f_sourcesink.c @@ -22,7 +22,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include "g_zero.h" diff --git a/drivers/usb/gadget/u_audio.c b/drivers/usb/gadget/u_audio.c index b5200d551458..8252595d619d 100644 --- a/drivers/usb/gadget/u_audio.c +++ b/drivers/usb/gadget/u_audio.c @@ -10,7 +10,6 @@ */ #include -#include #include #include #include diff --git a/drivers/usb/gadget/u_ether.c b/drivers/usb/gadget/u_ether.c index f8751ff863cd..2fc02bd95848 100644 --- a/drivers/usb/gadget/u_ether.c +++ b/drivers/usb/gadget/u_ether.c @@ -23,7 +23,6 @@ /* #define VERBOSE_DEBUG */ #include -#include #include #include #include diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index c3ac18054057..247436c10deb 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 0336f2beacde..b583ab0a4cbb 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -8,7 +8,6 @@ #include #include -#include #include #include diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index e1d528653192..ad9dbbc9145d 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index c862c9340f9a..5e078b222b4e 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index ee6a13f05443..3f8881d1a050 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -7,7 +7,6 @@ */ #include -#include #include #include #include diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 35869a4921f1..5fe5492fbd29 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index be6544aef41f..ed7c269e2514 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -36,7 +36,6 @@ */ #include -#include #include #include #include diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index cfc30d362f94..83ad47cbdd8a 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 7be72d90d49d..ef583854d8d0 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index cdfa86fa1471..ba2c199592fd 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -38,7 +38,6 @@ #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 81eff8e58322..01cf8cc3d286 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 75997b4deaf3..ca96bce50e18 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c5c88124096d..ca46002ec10e 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 4d9e6b288dd8..0334000676d3 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 83a9f2972ac8..437698e9465f 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index f8b653fcd4dd..83bcaf266b35 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 43e6e3280569..d9fa3d22e17c 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 98569e86c613..52ec020ea78b 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 756f5b0998e0..00f53b2aea76 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 24feb449a1dc..4cc3c890a2cd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 579dd1b1110f..e3421030a69f 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #define MLOG_MASK_PREFIX ML_NAMEI diff --git a/include/linux/utsname.h b/include/linux/utsname.h index 3656b300de3a..69f39974c041 100644 --- a/include/linux/utsname.h +++ b/include/linux/utsname.h @@ -36,7 +36,6 @@ struct new_utsname { #include #include #include -#include struct uts_namespace { struct kref kref; diff --git a/init/main.c b/init/main.c index 6107223124e4..76961db298f0 100644 --- a/init/main.c +++ b/init/main.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 8ba052c86d48..b101cdc4df3f 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -13,7 +13,6 @@ #include #include -#include #include #include #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0dfaa47d7cb6..7f4f57bea4ce 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/uid16.c b/kernel/uid16.c index 0314501688b9..419209893d87 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -4,7 +4,6 @@ */ #include -#include #include #include #include diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index c70dd7f5258e..1db618f56ecb 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -8,7 +8,6 @@ #include #include -#include #include #ifdef RPC_DEBUG -- cgit v1.2.3 From 9e6ec39becb02bda776eebf12c0677910d54b848 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Fri, 25 Sep 2009 16:28:02 -0400 Subject: make Linux bootable on ARM again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 200b812d00 "Clear the exclusive monitor when returning from an exception" broke the vast majority of ARM systems in the wild which are still pre ARMv6. The kernel is crashing on the first occurrence of an exception due to the removal of the actual return instruction for them. Let's add it back. Signed-off-by: Nicolas Pitre Acked-by: Uwe Kleine-König Signed-off-by: Linus Torvalds --- arch/arm/kernel/entry-header.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S index e17e3c30d957..ac34c0d9384b 100644 --- a/arch/arm/kernel/entry-header.S +++ b/arch/arm/kernel/entry-header.S @@ -83,6 +83,8 @@ ldr r0, [sp] strex r1, r2, [sp] @ clear the exclusive monitor ldmib sp, {r1 - pc}^ @ load r1 - pc, cpsr +#else + ldmia sp, {r0 - pc}^ @ load r0 - pc, cpsr #endif .endm -- cgit v1.2.3 From e616c591405c168f6dc3dfd1221e105adfe49b8d Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 27 Sep 2009 20:55:43 +0100 Subject: ARM: Don't allow highmem on SMP platforms without h/w TLB ops broadcast We suffer an unfortunate combination of "features" which makes highmem support on platforms without hardware TLB maintainence broadcast difficult: - we need kmap_high_get() support for DMA cache coherence - this requires kmap_high() to take a spinlock with IRQs disabled - kmap_high() occasionally calls flush_all_zero_pkmaps() to clear out old mappings - flush_all_zero_pkmaps() calls flush_tlb_kernel_range(), which on s/w IPI'd systems eventually calls smp_call_function_many() - smp_call_function_many() must not be called with IRQs disabled: WARNING: at kernel/smp.c:380 smp_call_function_many+0xc4/0x240() Modules linked in: Backtrace: [] (dump_backtrace+0x0/0x108) from [] (dump_stack+0x18/0x1c) r6:c007cd18 r5:c02ff228 r4:0000017c [] (dump_stack+0x0/0x1c) from [] (warn_slowpath_common+0x50/0x80) [] (warn_slowpath_common+0x0/0x80) from [] (warn_slowpath_null+0x18/0x1c) r7:00000003 r6:00000001 r5:c1ff4000 r4:c035fa34 [] (warn_slowpath_null+0x0/0x1c) from [] (smp_call_function_many+0xc4/0x240) [] (smp_call_function_many+0x0/0x240) from [] (smp_call_function+0x2c/0x38) [] (smp_call_function+0x0/0x38) from [] (on_each_cpu+0x1c/0x38) [] (on_each_cpu+0x0/0x38) from [] (flush_tlb_kernel_range+0x50/0x58) r6:00000001 r5:00000800 r4:c05f3590 [] (flush_tlb_kernel_range+0x0/0x58) from [] (flush_all_zero_pkmaps+0xc0/0xe8) [] (flush_all_zero_pkmaps+0x0/0xe8) from [] (kmap_high+0x8c/0x1e0) [] (kmap_high+0x0/0x1e0) from [] (kmap+0x44/0x5c) [] (kmap+0x0/0x5c) from [] (cramfs_readpage+0x3c/0x194) [] (cramfs_readpage+0x0/0x194) from [] (__do_page_cache_readahead+0x1f0/0x290) [] (__do_page_cache_readahead+0x0/0x290) from [] (ra_submit+0x30/0x38) [] (ra_submit+0x0/0x38) from [] (filemap_fault+0x3dc/0x438) r4:c1819988 [] (filemap_fault+0x0/0x438) from [] (__do_fault+0x58/0x43c) [] (__do_fault+0x0/0x43c) from [] (handle_mm_fault+0x104/0x318) [] (handle_mm_fault+0x0/0x318) from [] (do_page_fault+0x188/0x1e4) [] (do_page_fault+0x0/0x1e4) from [] (do_translation_fault+0x7c/0x84) [] (do_translation_fault+0x0/0x84) from [] (do_DataAbort+0x40/0xa4) r8:c1ff5e20 r7:c0340120 r6:00000805 r5:c1ff5e54 r4:c03400d0 [] (do_DataAbort+0x0/0xa4) from [] (__dabt_svc+0x4c/0x60) ... So we disable highmem support on these systems. Signed-off-by: Russell King --- arch/arm/include/asm/smp_plat.h | 16 ++++++++++++++++ arch/arm/kernel/smp.c | 7 +------ arch/arm/mm/mmu.c | 37 +++++++++++++++++++++++++++++++++---- 3 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 arch/arm/include/asm/smp_plat.h (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/smp_plat.h b/arch/arm/include/asm/smp_plat.h new file mode 100644 index 000000000000..59303e200845 --- /dev/null +++ b/arch/arm/include/asm/smp_plat.h @@ -0,0 +1,16 @@ +/* + * ARM specific SMP header, this contains our implementation + * details. + */ +#ifndef __ASMARM_SMP_PLAT_H +#define __ASMARM_SMP_PLAT_H + +#include + +/* all SMP configurations have the extended CPUID registers */ +static inline int tlb_ops_need_broadcast(void) +{ + return ((read_cpuid_ext(CPUID_EXT_MMFR3) >> 12) & 0xf) < 2; +} + +#endif diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index e0d32770bb3d..9d015ee5747a 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -36,6 +36,7 @@ #include #include #include +#include /* * as from 2.5, kernels no longer have an init_tasks structure @@ -586,12 +587,6 @@ struct tlb_args { unsigned long ta_end; }; -/* all SMP configurations have the extended CPUID registers */ -static inline int tlb_ops_need_broadcast(void) -{ - return ((read_cpuid_ext(CPUID_EXT_MMFR3) >> 12) & 0xf) < 2; -} - static inline void ipi_flush_tlb_all(void *ignored) { local_flush_tlb_all(); diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index ce551ec2cb23..02243eeccf50 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -709,10 +710,6 @@ static void __init sanity_check_meminfo(void) if (meminfo.nr_banks >= NR_BANKS) { printk(KERN_CRIT "NR_BANKS too low, " "ignoring high memory\n"); - } else if (cache_is_vipt_aliasing()) { - printk(KERN_CRIT "HIGHMEM is not yet supported " - "with VIPT aliasing cache, " - "ignoring high memory\n"); } else { memmove(bank + 1, bank, (meminfo.nr_banks - i) * sizeof(*bank)); @@ -756,6 +753,38 @@ static void __init sanity_check_meminfo(void) #endif j++; } +#ifdef CONFIG_HIGHMEM + if (highmem) { + const char *reason = NULL; + + if (cache_is_vipt_aliasing()) { + /* + * Interactions between kmap and other mappings + * make highmem support with aliasing VIPT caches + * rather difficult. + */ + reason = "with VIPT aliasing cache"; +#ifdef CONFIG_SMP + } else if (tlb_ops_need_broadcast()) { + /* + * kmap_high needs to occasionally flush TLB entries, + * however, if the TLB entries need to be broadcast + * we may deadlock: + * kmap_high(irqs off)->flush_all_zero_pkmaps-> + * flush_tlb_kernel_range->smp_call_function_many + * (must not be called with irqs off) + */ + reason = "without hardware TLB ops broadcasting"; +#endif + } + if (reason) { + printk(KERN_CRIT "HIGHMEM is not supported %s, ignoring high memory\n", + reason); + while (j > 0 && meminfo.bank[j - 1].highmem) + j--; + } + } +#endif meminfo.nr_banks = j; } -- cgit v1.2.3 From 90140c30a7b8c77e8872a389d48678d78e58789f Mon Sep 17 00:00:00 2001 From: Russell King Date: Sun, 27 Sep 2009 21:04:48 +0100 Subject: ARM: Fix __cpuexit section mismatch warnings Fix: WARNING: vmlinux.o(.text+0x247c): Section mismatch in reference from the function cpu_idle() to the function .cpuexit.text:cpu_die() The function cpu_idle() references a function in an exit section. Often the function cpu_die() has valid usage outside the exit section and the fix is to remove the __cpuexit annotation of cpu_die. WARNING: vmlinux.o(.cpuexit.text+0x3c): Section mismatch in reference from the function cpu_die() to the function .cpuinit.text:secondary_start_kernel() The function __cpuexit cpu_die() references a function __cpuinit secondary_start_kernel(). This is often seen when error handling in the exit function uses functionality in the init path. The fix is often to remove the __cpuinit annotation of secondary_start_kernel() so it may be used outside an init section. Sam says: > The annotation of cpu_die() is wrong. > To be annotated __cpuexit the function shall: > - be used in exit context and only in exit context with HOTPLUG_CPU=n > - be used outside exit context with HOTPLUG_CPU=y So, this also means __cpu_disable(), __cpu_die() and twd_timer_stop() are also wrong. However, removing __cpuexit from cpu_die() creates: WARNING: vmlinux.o(.text+0x6834): Section mismatch in reference from the function cpu_die() to the function .cpuinit.text:secondary_start_kernel() The function cpu_die() references the function __cpuinit secondary_start_kernel(). This is often because cpu_die lacks a __cpuinit annotation or the annotation of secondary_start_kernel is wrong. so fix this using __ref. Signed-off-by: Russell King Acked-by: Sam Ravnborg --- arch/arm/kernel/smp.c | 6 +++--- arch/arm/kernel/smp_twd.c | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 9d015ee5747a..57162af53dc9 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -154,7 +154,7 @@ int __cpuinit __cpu_up(unsigned int cpu) /* * __cpu_disable runs on the processor to be shutdown. */ -int __cpuexit __cpu_disable(void) +int __cpu_disable(void) { unsigned int cpu = smp_processor_id(); struct task_struct *p; @@ -201,7 +201,7 @@ int __cpuexit __cpu_disable(void) * called on the thread which is asking for a CPU to be shutdown - * waits until shutdown has completed, or it is timed out. */ -void __cpuexit __cpu_die(unsigned int cpu) +void __cpu_die(unsigned int cpu) { if (!platform_cpu_kill(cpu)) printk("CPU%u: unable to kill\n", cpu); @@ -215,7 +215,7 @@ void __cpuexit __cpu_die(unsigned int cpu) * of the other hotplug-cpu capable cores, so presumably coming * out of idle fixes this. */ -void __cpuexit cpu_die(void) +void __ref cpu_die(void) { unsigned int cpu = smp_processor_id(); diff --git a/arch/arm/kernel/smp_twd.c b/arch/arm/kernel/smp_twd.c index d8c88c633c6f..a73a34dccf2a 100644 --- a/arch/arm/kernel/smp_twd.c +++ b/arch/arm/kernel/smp_twd.c @@ -166,10 +166,12 @@ void __cpuinit twd_timer_setup(struct clock_event_device *clk) clockevents_register_device(clk); } +#ifdef CONFIG_HOTPLUG_CPU /* * take a local timer down */ -void __cpuexit twd_timer_stop(void) +void twd_timer_stop(void) { __raw_writel(0, twd_base + TWD_TIMER_CONTROL); } +#endif -- cgit v1.2.3 From aa45ee8fc0ee87c1711b5fe8eb3556d06530c39e Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 28 Sep 2009 11:41:51 +0100 Subject: ARM: Ensure do_cache_op takes mmap_sem do_cache_op() uses find_vma() to validate its arguments without holding any locking. This means that the VMA could vanish beneath us. Fix this by taking a read lock on mmap_sem. Signed-off-by: Russell King --- arch/arm/kernel/traps.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 57eb0f6f6005..467b69ed1021 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -418,12 +418,14 @@ static int bad_syscall(int n, struct pt_regs *regs) static inline void do_cache_op(unsigned long start, unsigned long end, int flags) { + struct mm_struct *mm = current->active_mm; struct vm_area_struct *vma; if (end < start || flags) return; - vma = find_vma(current->active_mm, start); + down_read(&mm->mmap_sem); + vma = find_vma(mm, start); if (vma && vma->vm_start < end) { if (start < vma->vm_start) start = vma->vm_start; @@ -432,6 +434,7 @@ do_cache_op(unsigned long start, unsigned long end, int flags) flush_cache_user_range(vma, start, end); } + up_read(&mm->mmap_sem); } /* -- cgit v1.2.3 From 6176d39471943a2e574782cbf62deded19b96aa0 Mon Sep 17 00:00:00 2001 From: Dmitry Artamonow Date: Tue, 29 Sep 2009 06:12:37 +0100 Subject: ARM: 5734/1: arm: fix compilation of entry-common.S for older CPUs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 181f817eaaca4c1f introduced some new code to entry-common.S Sadly, this new code uses 'bx' instruction which is available only on ARMv5 and higher CPUs. This causes following compilation errors when building kernel for StrongARM (ARMv4): arch/arm/kernel/entry-common.S: Assembler messages: arch/arm/kernel/entry-common.S:129: Error: selected processor does not support `bx ip' arch/arm/kernel/entry-common.S:138: Error: selected processor does not support `bx ip' Fix these errors by using 'mov pc' instead of 'bx'. Signed-off-by: Dmitry Artamonow Acked-by: Uwe Kleine-König Signed-off-by: Russell King --- arch/arm/kernel/entry-common.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 807cfebb0f44..825db52e558a 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -126,7 +126,7 @@ ENTRY(__gnu_mcount_nc) cmp r0, r2 bne gnu_trace ldmia sp!, {r0-r3, ip, lr} - bx ip + mov pc, ip gnu_trace: ldr r1, [sp, #20] @ lr of instrumented routine @@ -135,7 +135,7 @@ gnu_trace: mov lr, pc mov pc, r2 ldmia sp!, {r0-r3, ip, lr} - bx ip + mov pc, ip ENTRY(mcount) stmdb sp!, {r0-r3, lr} -- cgit v1.2.3 From 31abdb744179159f8b605f56da5b197b188e1689 Mon Sep 17 00:00:00 2001 From: David Brown Date: Thu, 1 Oct 2009 17:43:29 +0100 Subject: ARM: 5739/1: ARM: allow empty ATAG_CORE From: David Brown The ATAG_CORE is allowed to be empty. Although this is handled by parse_tag_core(), __vet_atags during startup rejects this tag unless it contains data. Allow the initial tag to be either the full size, or empty. Signed-off-by: David Brown Signed-off-by: Russell King --- arch/arm/kernel/head-common.S | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/arm/kernel') diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S index 93ad576b2d74..885a7214418d 100644 --- a/arch/arm/kernel/head-common.S +++ b/arch/arm/kernel/head-common.S @@ -13,6 +13,7 @@ #define ATAG_CORE 0x54410001 #define ATAG_CORE_SIZE ((2*4 + 3*4) >> 2) +#define ATAG_CORE_SIZE_EMPTY ((2*4) >> 2) .align 2 .type __switch_data, %object @@ -251,7 +252,8 @@ __vet_atags: bne 1f ldr r5, [r2, #0] @ is first tag ATAG_CORE? - subs r5, r5, #ATAG_CORE_SIZE + cmp r5, #ATAG_CORE_SIZE + cmpne r5, #ATAG_CORE_SIZE_EMPTY bne 1f ldr r5, [r2, #4] ldr r6, =ATAG_CORE -- cgit v1.2.3 From 4fb2847437d871fe579f820ceb18031db3359901 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 25 Sep 2009 13:39:47 +0100 Subject: ARM: 5727/1: Pass IFSR register to do_PrefetchAbort() Instruction fault status register, IFSR, was introduced on ARMv6 to provide status information about the last insturction fault. It needed for proper prefetch abort handling. Now we have three prefetch abort model: * legacy - for CPUs before ARMv6. They doesn't provide neither IFSR nor IFAR. We simulate IFSR with section translation fault status for them to generalize code; * ARMv6 - provides IFSR, but not IFAR; * ARMv7 - provides both IFSR and IFAR. Signed-off-by: Kirill A. Shutemov Signed-off-by: Russell King --- arch/arm/include/asm/glue.h | 26 ++++++++++++++----- arch/arm/kernel/entry-armv.S | 18 +++++-------- arch/arm/kernel/entry-common.S | 7 ------ arch/arm/mm/Kconfig | 57 ++++++++++++++++++++++-------------------- arch/arm/mm/Makefile | 4 +++ arch/arm/mm/fault.c | 2 +- arch/arm/mm/pabort-legacy.S | 19 ++++++++++++++ arch/arm/mm/pabort-v6.S | 19 ++++++++++++++ arch/arm/mm/pabort-v7.S | 20 +++++++++++++++ arch/arm/mm/proc-arm1020.S | 2 +- arch/arm/mm/proc-arm1020e.S | 2 +- arch/arm/mm/proc-arm1022.S | 2 +- arch/arm/mm/proc-arm1026.S | 2 +- arch/arm/mm/proc-arm6_7.S | 4 +-- arch/arm/mm/proc-arm720.S | 2 +- arch/arm/mm/proc-arm740.S | 2 +- arch/arm/mm/proc-arm7tdmi.S | 2 +- arch/arm/mm/proc-arm920.S | 2 +- arch/arm/mm/proc-arm922.S | 2 +- arch/arm/mm/proc-arm925.S | 2 +- arch/arm/mm/proc-arm926.S | 2 +- arch/arm/mm/proc-arm940.S | 2 +- arch/arm/mm/proc-arm946.S | 2 +- arch/arm/mm/proc-arm9tdmi.S | 2 +- arch/arm/mm/proc-fa526.S | 2 +- arch/arm/mm/proc-feroceon.S | 2 +- arch/arm/mm/proc-mohawk.S | 2 +- arch/arm/mm/proc-sa110.S | 2 +- arch/arm/mm/proc-sa1100.S | 2 +- arch/arm/mm/proc-v6.S | 2 +- arch/arm/mm/proc-v7.S | 2 +- arch/arm/mm/proc-xsc3.S | 2 +- arch/arm/mm/proc-xscale.S | 2 +- 33 files changed, 144 insertions(+), 78 deletions(-) create mode 100644 arch/arm/mm/pabort-legacy.S create mode 100644 arch/arm/mm/pabort-v6.S create mode 100644 arch/arm/mm/pabort-v7.S (limited to 'arch/arm/kernel') diff --git a/arch/arm/include/asm/glue.h b/arch/arm/include/asm/glue.h index a0e39d5d00c9..234a3fc1c78e 100644 --- a/arch/arm/include/asm/glue.h +++ b/arch/arm/include/asm/glue.h @@ -120,25 +120,39 @@ #endif /* - * Prefetch abort handler. If the CPU has an IFAR use that, otherwise - * use the address of the aborted instruction + * Prefetch Abort Model + * ================ + * + * We have the following to choose from: + * legacy - no IFSR, no IFAR + * v6 - ARMv6: IFSR, no IFAR + * v7 - ARMv7: IFSR and IFAR */ + #undef CPU_PABORT_HANDLER #undef MULTI_PABORT -#ifdef CONFIG_CPU_PABRT_IFAR +#ifdef CONFIG_CPU_PABRT_LEGACY +# ifdef CPU_PABORT_HANDLER +# define MULTI_PABORT 1 +# else +# define CPU_PABORT_HANDLER legacy_pabort +# endif +#endif + +#ifdef CONFIG_CPU_PABRT_V6 # ifdef CPU_PABORT_HANDLER # define MULTI_PABORT 1 # else -# define CPU_PABORT_HANDLER(reg, insn) mrc p15, 0, reg, cr6, cr0, 2 +# define CPU_PABORT_HANDLER v6_pabort # endif #endif -#ifdef CONFIG_CPU_PABRT_NOIFAR +#ifdef CONFIG_CPU_PABRT_V7 # ifdef CPU_PABORT_HANDLER # define MULTI_PABORT 1 # else -# define CPU_PABORT_HANDLER(reg, insn) mov reg, insn +# define CPU_PABORT_HANDLER v7_pabort # endif #endif diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 0a2ba51cf35d..322410be573c 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -311,22 +311,16 @@ __pabt_svc: tst r3, #PSR_I_BIT biceq r9, r9, #PSR_I_BIT - @ - @ set args, then call main handler - @ - @ r0 - address of faulting instruction - @ r1 - pointer to registers on stack - @ -#ifdef MULTI_PABORT mov r0, r2 @ pass address of aborted instruction. +#ifdef MULTI_PABORT ldr r4, .LCprocfns mov lr, pc ldr pc, [r4, #PROCESSOR_PABT_FUNC] #else - CPU_PABORT_HANDLER(r0, r2) + bl CPU_PABORT_HANDLER #endif msr cpsr_c, r9 @ Maybe enable interrupts - mov r1, sp @ regs + mov r2, sp @ regs bl do_PrefetchAbort @ call abort handler @ @@ -701,16 +695,16 @@ ENDPROC(__und_usr_unknown) __pabt_usr: usr_entry -#ifdef MULTI_PABORT mov r0, r2 @ pass address of aborted instruction. +#ifdef MULTI_PABORT ldr r4, .LCprocfns mov lr, pc ldr pc, [r4, #PROCESSOR_PABT_FUNC] #else - CPU_PABORT_HANDLER(r0, r2) + bl CPU_PABORT_HANDLER #endif enable_irq @ Enable interrupts - mov r1, sp @ regs + mov r2, sp @ regs bl do_PrefetchAbort @ call abort handler UNWIND(.fnend ) /* fall through */ diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 825db52e558a..f0fe95b7085d 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -425,13 +425,6 @@ sys_mmap2: #endif ENDPROC(sys_mmap2) -ENTRY(pabort_ifar) - mrc p15, 0, r0, cr6, cr0, 2 -ENTRY(pabort_noifar) - mov pc, lr -ENDPROC(pabort_ifar) -ENDPROC(pabort_noifar) - #ifdef CONFIG_OABI_COMPAT /* diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig index 8d43e58f9244..e993140edd88 100644 --- a/arch/arm/mm/Kconfig +++ b/arch/arm/mm/Kconfig @@ -17,7 +17,7 @@ config CPU_ARM610 select CPU_CP15_MMU select CPU_COPY_V3 if MMU select CPU_TLB_V3 if MMU - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY help The ARM610 is the successor to the ARM3 processor and was produced by VLSI Technology Inc. @@ -31,7 +31,7 @@ config CPU_ARM7TDMI depends on !MMU select CPU_32v4T select CPU_ABRT_LV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4 help A 32-bit RISC microprocessor based on the ARM7 processor core @@ -49,7 +49,7 @@ config CPU_ARM710 select CPU_CP15_MMU select CPU_COPY_V3 if MMU select CPU_TLB_V3 if MMU - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY help A 32-bit RISC microprocessor based on the ARM7 processor core designed by Advanced RISC Machines Ltd. The ARM710 is the @@ -64,7 +64,7 @@ config CPU_ARM720T bool "Support ARM720T processor" if ARCH_INTEGRATOR select CPU_32v4T select CPU_ABRT_LV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4 select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -83,7 +83,7 @@ config CPU_ARM740T depends on !MMU select CPU_32v4T select CPU_ABRT_LV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V3 # although the core is v4t select CPU_CP15_MPU help @@ -100,7 +100,7 @@ config CPU_ARM9TDMI depends on !MMU select CPU_32v4T select CPU_ABRT_NOMMU - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4 help A 32-bit RISC microprocessor based on the ARM9 processor core @@ -114,7 +114,7 @@ config CPU_ARM920T bool "Support ARM920T processor" if ARCH_INTEGRATOR select CPU_32v4T select CPU_ABRT_EV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WT select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -135,7 +135,7 @@ config CPU_ARM922T bool "Support ARM922T processor" if ARCH_INTEGRATOR select CPU_32v4T select CPU_ABRT_EV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WT select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -154,7 +154,7 @@ config CPU_ARM925T bool "Support ARM925T processor" if ARCH_OMAP1 select CPU_32v4T select CPU_ABRT_EV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WT select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -173,7 +173,7 @@ config CPU_ARM926T bool "Support ARM926T processor" if ARCH_INTEGRATOR || MACH_REALVIEW_EB select CPU_32v5 select CPU_ABRT_EV5TJ - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_COPY_V4WB if MMU @@ -191,7 +191,7 @@ config CPU_FA526 bool select CPU_32v4 select CPU_ABRT_EV4 - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_CACHE_FA @@ -210,7 +210,7 @@ config CPU_ARM940T depends on !MMU select CPU_32v4T select CPU_ABRT_NOMMU - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MPU help @@ -228,7 +228,7 @@ config CPU_ARM946E depends on !MMU select CPU_32v5 select CPU_ABRT_NOMMU - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MPU help @@ -244,7 +244,7 @@ config CPU_ARM1020 bool "Support ARM1020T (rev 0) processor" if ARCH_INTEGRATOR select CPU_32v5 select CPU_ABRT_EV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WT select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -262,7 +262,7 @@ config CPU_ARM1020E bool "Support ARM1020E processor" if ARCH_INTEGRATOR select CPU_32v5 select CPU_ABRT_EV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WT select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -275,7 +275,7 @@ config CPU_ARM1022 bool "Support ARM1022E processor" if ARCH_INTEGRATOR select CPU_32v5 select CPU_ABRT_EV4T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_COPY_V4WB if MMU # can probably do better @@ -293,7 +293,7 @@ config CPU_ARM1026 bool "Support ARM1026EJ-S processor" if ARCH_INTEGRATOR select CPU_32v5 select CPU_ABRT_EV5T # But need Jazelle, but EV5TJ ignores bit 10 - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_COPY_V4WB if MMU # can probably do better @@ -311,7 +311,7 @@ config CPU_SA110 select CPU_32v3 if ARCH_RPC select CPU_32v4 if !ARCH_RPC select CPU_ABRT_EV4 - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WB select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -331,7 +331,7 @@ config CPU_SA1100 bool select CPU_32v4 select CPU_ABRT_EV4 - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_V4WB select CPU_CACHE_VIVT select CPU_CP15_MMU @@ -342,7 +342,7 @@ config CPU_XSCALE bool select CPU_32v5 select CPU_ABRT_EV5T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_TLB_V4WBI if MMU @@ -352,7 +352,7 @@ config CPU_XSC3 bool select CPU_32v5 select CPU_ABRT_EV5T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_TLB_V4WBI if MMU @@ -363,7 +363,7 @@ config CPU_MOHAWK bool select CPU_32v5 select CPU_ABRT_EV5T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_TLB_V4WBI if MMU @@ -374,7 +374,7 @@ config CPU_FEROCEON bool select CPU_32v5 select CPU_ABRT_EV5T - select CPU_PABRT_NOIFAR + select CPU_PABRT_LEGACY select CPU_CACHE_VIVT select CPU_CP15_MMU select CPU_COPY_FEROCEON if MMU @@ -394,7 +394,7 @@ config CPU_V6 bool "Support ARM V6 processor" if ARCH_INTEGRATOR || MACH_REALVIEW_EB || MACH_REALVIEW_PBX select CPU_32v6 select CPU_ABRT_EV6 - select CPU_PABRT_NOIFAR + select CPU_PABRT_V6 select CPU_CACHE_V6 select CPU_CACHE_VIPT select CPU_CP15_MMU @@ -420,7 +420,7 @@ config CPU_V7 select CPU_32v6K select CPU_32v7 select CPU_ABRT_EV7 - select CPU_PABRT_IFAR + select CPU_PABRT_V7 select CPU_CACHE_V7 select CPU_CACHE_VIPT select CPU_CP15_MMU @@ -482,10 +482,13 @@ config CPU_ABRT_EV6 config CPU_ABRT_EV7 bool -config CPU_PABRT_IFAR +config CPU_PABRT_LEGACY bool -config CPU_PABRT_NOIFAR +config CPU_PABRT_V6 + bool + +config CPU_PABRT_V7 bool # The cache model diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index 63e3f6dd0e21..055cb2aa8134 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -27,6 +27,10 @@ obj-$(CONFIG_CPU_ABRT_EV5TJ) += abort-ev5tj.o obj-$(CONFIG_CPU_ABRT_EV6) += abort-ev6.o obj-$(CONFIG_CPU_ABRT_EV7) += abort-ev7.o +obj-$(CONFIG_CPU_PABRT_LEGACY) += pabort-legacy.o +obj-$(CONFIG_CPU_PABRT_V6) += pabort-v6.o +obj-$(CONFIG_CPU_PABRT_V7) += pabort-v7.o + obj-$(CONFIG_CPU_CACHE_V3) += cache-v3.o obj-$(CONFIG_CPU_CACHE_V4) += cache-v4.o obj-$(CONFIG_CPU_CACHE_V4WT) += cache-v4wt.o diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 379f78556055..fd2375f84994 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -520,7 +520,7 @@ do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs) } asmlinkage void __exception -do_PrefetchAbort(unsigned long addr, struct pt_regs *regs) +do_PrefetchAbort(unsigned long addr, unsigned int ifsr, struct pt_regs *regs) { do_translation_fault(addr, FSR_LNX_PF, regs); } diff --git a/arch/arm/mm/pabort-legacy.S b/arch/arm/mm/pabort-legacy.S new file mode 100644 index 000000000000..87970eba88ea --- /dev/null +++ b/arch/arm/mm/pabort-legacy.S @@ -0,0 +1,19 @@ +#include +#include + +/* + * Function: legacy_pabort + * + * Params : r0 = address of aborted instruction + * + * Returns : r0 = address of abort + * : r1 = Simulated IFSR with section translation fault status + * + * Purpose : obtain information about current prefetch abort. + */ + + .align 5 +ENTRY(legacy_pabort) + mov r1, #5 + mov pc, lr +ENDPROC(legacy_pabort) diff --git a/arch/arm/mm/pabort-v6.S b/arch/arm/mm/pabort-v6.S new file mode 100644 index 000000000000..06e3d1ef2115 --- /dev/null +++ b/arch/arm/mm/pabort-v6.S @@ -0,0 +1,19 @@ +#include +#include + +/* + * Function: v6_pabort + * + * Params : r0 = address of aborted instruction + * + * Returns : r0 = address of abort + * : r1 = IFSR + * + * Purpose : obtain information about current prefetch abort. + */ + + .align 5 +ENTRY(v6_pabort) + mrc p15, 0, r1, c5, c0, 1 @ get IFSR + mov pc, lr +ENDPROC(v6_pabort) diff --git a/arch/arm/mm/pabort-v7.S b/arch/arm/mm/pabort-v7.S new file mode 100644 index 000000000000..a8b3b300a18d --- /dev/null +++ b/arch/arm/mm/pabort-v7.S @@ -0,0 +1,20 @@ +#include +#include + +/* + * Function: v6_pabort + * + * Params : r0 = address of aborted instruction + * + * Returns : r0 = address of abort + * : r1 = IFSR + * + * Purpose : obtain information about current prefetch abort. + */ + + .align 5 +ENTRY(v7_pabort) + mrc p15, 0, r0, c6, c0, 2 @ get IFAR + mrc p15, 0, r1, c5, c0, 1 @ get IFSR + mov pc, lr +ENDPROC(v7_pabort) diff --git a/arch/arm/mm/proc-arm1020.S b/arch/arm/mm/proc-arm1020.S index b5551bf010aa..d9fb4b98c49f 100644 --- a/arch/arm/mm/proc-arm1020.S +++ b/arch/arm/mm/proc-arm1020.S @@ -449,7 +449,7 @@ arm1020_crval: .type arm1020_processor_functions, #object arm1020_processor_functions: .word v4t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm1020_proc_init .word cpu_arm1020_proc_fin .word cpu_arm1020_reset diff --git a/arch/arm/mm/proc-arm1020e.S b/arch/arm/mm/proc-arm1020e.S index 8bc6740c29eb..7453b75dcea5 100644 --- a/arch/arm/mm/proc-arm1020e.S +++ b/arch/arm/mm/proc-arm1020e.S @@ -430,7 +430,7 @@ arm1020e_crval: .type arm1020e_processor_functions, #object arm1020e_processor_functions: .word v4t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm1020e_proc_init .word cpu_arm1020e_proc_fin .word cpu_arm1020e_reset diff --git a/arch/arm/mm/proc-arm1022.S b/arch/arm/mm/proc-arm1022.S index 2cd03e66c0a3..8eb72d75a8b6 100644 --- a/arch/arm/mm/proc-arm1022.S +++ b/arch/arm/mm/proc-arm1022.S @@ -413,7 +413,7 @@ arm1022_crval: .type arm1022_processor_functions, #object arm1022_processor_functions: .word v4t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm1022_proc_init .word cpu_arm1022_proc_fin .word cpu_arm1022_reset diff --git a/arch/arm/mm/proc-arm1026.S b/arch/arm/mm/proc-arm1026.S index ad961a897f6e..3b59f0d67139 100644 --- a/arch/arm/mm/proc-arm1026.S +++ b/arch/arm/mm/proc-arm1026.S @@ -408,7 +408,7 @@ arm1026_crval: .type arm1026_processor_functions, #object arm1026_processor_functions: .word v5t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm1026_proc_init .word cpu_arm1026_proc_fin .word cpu_arm1026_reset diff --git a/arch/arm/mm/proc-arm6_7.S b/arch/arm/mm/proc-arm6_7.S index 80d6e1de069a..3f9cd3d8f6d5 100644 --- a/arch/arm/mm/proc-arm6_7.S +++ b/arch/arm/mm/proc-arm6_7.S @@ -278,7 +278,7 @@ __arm7_setup: mov r0, #0 .type arm6_processor_functions, #object ENTRY(arm6_processor_functions) .word cpu_arm6_data_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm6_proc_init .word cpu_arm6_proc_fin .word cpu_arm6_reset @@ -295,7 +295,7 @@ ENTRY(arm6_processor_functions) .type arm7_processor_functions, #object ENTRY(arm7_processor_functions) .word cpu_arm7_data_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm7_proc_init .word cpu_arm7_proc_fin .word cpu_arm7_reset diff --git a/arch/arm/mm/proc-arm720.S b/arch/arm/mm/proc-arm720.S index 85ae18695f10..0b62de244666 100644 --- a/arch/arm/mm/proc-arm720.S +++ b/arch/arm/mm/proc-arm720.S @@ -181,7 +181,7 @@ arm720_crval: .type arm720_processor_functions, #object ENTRY(arm720_processor_functions) .word v4t_late_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm720_proc_init .word cpu_arm720_proc_fin .word cpu_arm720_reset diff --git a/arch/arm/mm/proc-arm740.S b/arch/arm/mm/proc-arm740.S index 4f95bee63e95..01860cdeb2ec 100644 --- a/arch/arm/mm/proc-arm740.S +++ b/arch/arm/mm/proc-arm740.S @@ -126,7 +126,7 @@ __arm740_setup: .type arm740_processor_functions, #object ENTRY(arm740_processor_functions) .word v4t_late_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm740_proc_init .word cpu_arm740_proc_fin .word cpu_arm740_reset diff --git a/arch/arm/mm/proc-arm7tdmi.S b/arch/arm/mm/proc-arm7tdmi.S index 93e05fa7bed4..1201b9863829 100644 --- a/arch/arm/mm/proc-arm7tdmi.S +++ b/arch/arm/mm/proc-arm7tdmi.S @@ -64,7 +64,7 @@ __arm7tdmi_setup: .type arm7tdmi_processor_functions, #object ENTRY(arm7tdmi_processor_functions) .word v4t_late_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm7tdmi_proc_init .word cpu_arm7tdmi_proc_fin .word cpu_arm7tdmi_reset diff --git a/arch/arm/mm/proc-arm920.S b/arch/arm/mm/proc-arm920.S index 914d688394fc..2b7c197cc58d 100644 --- a/arch/arm/mm/proc-arm920.S +++ b/arch/arm/mm/proc-arm920.S @@ -395,7 +395,7 @@ arm920_crval: .type arm920_processor_functions, #object arm920_processor_functions: .word v4t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm920_proc_init .word cpu_arm920_proc_fin .word cpu_arm920_reset diff --git a/arch/arm/mm/proc-arm922.S b/arch/arm/mm/proc-arm922.S index 51c9c9859e58..06a1aa4e3398 100644 --- a/arch/arm/mm/proc-arm922.S +++ b/arch/arm/mm/proc-arm922.S @@ -399,7 +399,7 @@ arm922_crval: .type arm922_processor_functions, #object arm922_processor_functions: .word v4t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm922_proc_init .word cpu_arm922_proc_fin .word cpu_arm922_reset diff --git a/arch/arm/mm/proc-arm925.S b/arch/arm/mm/proc-arm925.S index 2724526d89c1..cb53435a85ae 100644 --- a/arch/arm/mm/proc-arm925.S +++ b/arch/arm/mm/proc-arm925.S @@ -462,7 +462,7 @@ arm925_crval: .type arm925_processor_functions, #object arm925_processor_functions: .word v4t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm925_proc_init .word cpu_arm925_proc_fin .word cpu_arm925_reset diff --git a/arch/arm/mm/proc-arm926.S b/arch/arm/mm/proc-arm926.S index 54466937bff9..1c4848704bb3 100644 --- a/arch/arm/mm/proc-arm926.S +++ b/arch/arm/mm/proc-arm926.S @@ -415,7 +415,7 @@ arm926_crval: .type arm926_processor_functions, #object arm926_processor_functions: .word v5tj_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm926_proc_init .word cpu_arm926_proc_fin .word cpu_arm926_reset diff --git a/arch/arm/mm/proc-arm940.S b/arch/arm/mm/proc-arm940.S index f595117caf55..5b0f8464c8f2 100644 --- a/arch/arm/mm/proc-arm940.S +++ b/arch/arm/mm/proc-arm940.S @@ -322,7 +322,7 @@ __arm940_setup: .type arm940_processor_functions, #object ENTRY(arm940_processor_functions) .word nommu_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm940_proc_init .word cpu_arm940_proc_fin .word cpu_arm940_reset diff --git a/arch/arm/mm/proc-arm946.S b/arch/arm/mm/proc-arm946.S index e03f6ff1fb26..40c0449a139b 100644 --- a/arch/arm/mm/proc-arm946.S +++ b/arch/arm/mm/proc-arm946.S @@ -377,7 +377,7 @@ __arm946_setup: .type arm946_processor_functions, #object ENTRY(arm946_processor_functions) .word nommu_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm946_proc_init .word cpu_arm946_proc_fin .word cpu_arm946_reset diff --git a/arch/arm/mm/proc-arm9tdmi.S b/arch/arm/mm/proc-arm9tdmi.S index be6c11d2b3fb..28545c29dbcd 100644 --- a/arch/arm/mm/proc-arm9tdmi.S +++ b/arch/arm/mm/proc-arm9tdmi.S @@ -64,7 +64,7 @@ __arm9tdmi_setup: .type arm9tdmi_processor_functions, #object ENTRY(arm9tdmi_processor_functions) .word nommu_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_arm9tdmi_proc_init .word cpu_arm9tdmi_proc_fin .word cpu_arm9tdmi_reset diff --git a/arch/arm/mm/proc-fa526.S b/arch/arm/mm/proc-fa526.S index 08b8a955d5d7..08f5ac237ad4 100644 --- a/arch/arm/mm/proc-fa526.S +++ b/arch/arm/mm/proc-fa526.S @@ -191,7 +191,7 @@ fa526_cr1_set: .type fa526_processor_functions, #object fa526_processor_functions: .word v4_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_fa526_proc_init .word cpu_fa526_proc_fin .word cpu_fa526_reset diff --git a/arch/arm/mm/proc-feroceon.S b/arch/arm/mm/proc-feroceon.S index 0fe1f8fc3488..d0d7795200fc 100644 --- a/arch/arm/mm/proc-feroceon.S +++ b/arch/arm/mm/proc-feroceon.S @@ -499,7 +499,7 @@ feroceon_crval: .type feroceon_processor_functions, #object feroceon_processor_functions: .word v5t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_feroceon_proc_init .word cpu_feroceon_proc_fin .word cpu_feroceon_reset diff --git a/arch/arm/mm/proc-mohawk.S b/arch/arm/mm/proc-mohawk.S index 540f5078496b..52b5fd74fbb3 100644 --- a/arch/arm/mm/proc-mohawk.S +++ b/arch/arm/mm/proc-mohawk.S @@ -359,7 +359,7 @@ mohawk_crval: .type mohawk_processor_functions, #object mohawk_processor_functions: .word v5t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_mohawk_proc_init .word cpu_mohawk_proc_fin .word cpu_mohawk_reset diff --git a/arch/arm/mm/proc-sa110.S b/arch/arm/mm/proc-sa110.S index 90a7e5279f29..7b706b389906 100644 --- a/arch/arm/mm/proc-sa110.S +++ b/arch/arm/mm/proc-sa110.S @@ -199,7 +199,7 @@ sa110_crval: .type sa110_processor_functions, #object ENTRY(sa110_processor_functions) .word v4_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_sa110_proc_init .word cpu_sa110_proc_fin .word cpu_sa110_reset diff --git a/arch/arm/mm/proc-sa1100.S b/arch/arm/mm/proc-sa1100.S index 451e2d953e2a..ee7700242c19 100644 --- a/arch/arm/mm/proc-sa1100.S +++ b/arch/arm/mm/proc-sa1100.S @@ -214,7 +214,7 @@ sa1100_crval: .type sa1100_processor_functions, #object ENTRY(sa1100_processor_functions) .word v4_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_sa1100_proc_init .word cpu_sa1100_proc_fin .word cpu_sa1100_reset diff --git a/arch/arm/mm/proc-v6.S b/arch/arm/mm/proc-v6.S index 524ddae92595..194737d60a22 100644 --- a/arch/arm/mm/proc-v6.S +++ b/arch/arm/mm/proc-v6.S @@ -191,7 +191,7 @@ v6_crval: .type v6_processor_functions, #object ENTRY(v6_processor_functions) .word v6_early_abort - .word pabort_noifar + .word v6_pabort .word cpu_v6_proc_init .word cpu_v6_proc_fin .word cpu_v6_reset diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index f3fa1c32fe92..23ebcf6eab9f 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -295,7 +295,7 @@ __v7_setup_stack: .type v7_processor_functions, #object ENTRY(v7_processor_functions) .word v7_early_abort - .word pabort_ifar + .word v7_pabort .word cpu_v7_proc_init .word cpu_v7_proc_fin .word cpu_v7_reset diff --git a/arch/arm/mm/proc-xsc3.S b/arch/arm/mm/proc-xsc3.S index 33515c214b92..2028f3702881 100644 --- a/arch/arm/mm/proc-xsc3.S +++ b/arch/arm/mm/proc-xsc3.S @@ -428,7 +428,7 @@ xsc3_crval: .type xsc3_processor_functions, #object ENTRY(xsc3_processor_functions) .word v5t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_xsc3_proc_init .word cpu_xsc3_proc_fin .word cpu_xsc3_reset diff --git a/arch/arm/mm/proc-xscale.S b/arch/arm/mm/proc-xscale.S index 423394260bcb..f056c283682d 100644 --- a/arch/arm/mm/proc-xscale.S +++ b/arch/arm/mm/proc-xscale.S @@ -511,7 +511,7 @@ xscale_crval: .type xscale_processor_functions, #object ENTRY(xscale_processor_functions) .word v5t_early_abort - .word pabort_noifar + .word legacy_pabort .word cpu_xscale_proc_init .word cpu_xscale_proc_fin .word cpu_xscale_reset -- cgit v1.2.3