From 6a4a636fad018500c5db7a2b56a00caeb21cbb2c Mon Sep 17 00:00:00 2001 From: Jon Smirl Date: Sun, 20 Jul 2008 11:27:22 -0400 Subject: powerpc/mpc5200: Add AC97 register definitions for the MPC52xx PSC Needed by the PSC AC97 sound driver Signed-off-by: Jon Smirl Signed-off-by: Grant Likely --- include/asm-powerpc/mpc52xx_psc.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-powerpc/mpc52xx_psc.h b/include/asm-powerpc/mpc52xx_psc.h index 710c5d36efaa..5467c2c0faa7 100644 --- a/include/asm-powerpc/mpc52xx_psc.h +++ b/include/asm-powerpc/mpc52xx_psc.h @@ -132,8 +132,12 @@ struct mpc52xx_psc { u8 reserved5[3]; u8 ctlr; /* PSC + 0x1c */ u8 reserved6[3]; - u16 ccr; /* PSC + 0x20 */ - u8 reserved7[14]; + /* BitClkDiv field of CCR is byte swapped in + * the hardware for mpc5200/b compatibility */ + u32 ccr; /* PSC + 0x20 */ + u32 ac97_slots; /* PSC + 0x24 */ + u32 ac97_cmd; /* PSC + 0x28 */ + u32 ac97_data; /* PSC + 0x2c */ u8 ivr; /* PSC + 0x30 */ u8 reserved8[3]; u8 ip; /* PSC + 0x34 */ -- cgit v1.2.3 From a19dd1bd7df839c52a668abcf288c2239442c3c9 Mon Sep 17 00:00:00 2001 From: Grant Likely Date: Tue, 22 Jul 2008 01:13:54 -0600 Subject: powerpc/mpc5200: add PSC SICR bit definitions Required by the PSC I2S audio driver. 
Signed-off-by: Grant Likely --- include/asm-powerpc/mpc52xx_psc.h | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-powerpc/mpc52xx_psc.h b/include/asm-powerpc/mpc52xx_psc.h index 5467c2c0faa7..8917ed630565 100644 --- a/include/asm-powerpc/mpc52xx_psc.h +++ b/include/asm-powerpc/mpc52xx_psc.h @@ -60,10 +60,12 @@ #define MPC52xx_PSC_RXTX_FIFO_ALARM 0x0002 #define MPC52xx_PSC_RXTX_FIFO_EMPTY 0x0001 -/* PSC interrupt mask bits */ +/* PSC interrupt status/mask bits */ #define MPC52xx_PSC_IMR_TXRDY 0x0100 #define MPC52xx_PSC_IMR_RXRDY 0x0200 #define MPC52xx_PSC_IMR_DB 0x0400 +#define MPC52xx_PSC_IMR_TXEMP 0x0800 +#define MPC52xx_PSC_IMR_ORERR 0x1000 #define MPC52xx_PSC_IMR_IPC 0x8000 /* PSC input port change bit */ @@ -92,6 +94,34 @@ #define MPC52xx_PSC_RFNUM_MASK 0x01ff +#define MPC52xx_PSC_SICR_DTS1 (1 << 29) +#define MPC52xx_PSC_SICR_SHDR (1 << 28) +#define MPC52xx_PSC_SICR_SIM_MASK (0xf << 24) +#define MPC52xx_PSC_SICR_SIM_UART (0x0 << 24) +#define MPC52xx_PSC_SICR_SIM_UART_DCD (0x8 << 24) +#define MPC52xx_PSC_SICR_SIM_CODEC_8 (0x1 << 24) +#define MPC52xx_PSC_SICR_SIM_CODEC_16 (0x2 << 24) +#define MPC52xx_PSC_SICR_SIM_AC97 (0x3 << 24) +#define MPC52xx_PSC_SICR_SIM_SIR (0x8 << 24) +#define MPC52xx_PSC_SICR_SIM_SIR_DCD (0xc << 24) +#define MPC52xx_PSC_SICR_SIM_MIR (0x5 << 24) +#define MPC52xx_PSC_SICR_SIM_FIR (0x6 << 24) +#define MPC52xx_PSC_SICR_SIM_CODEC_24 (0x7 << 24) +#define MPC52xx_PSC_SICR_SIM_CODEC_32 (0xf << 24) +#define MPC52xx_PSC_SICR_GENCLK (1 << 23) +#define MPC52xx_PSC_SICR_I2S (1 << 22) +#define MPC52xx_PSC_SICR_CLKPOL (1 << 21) +#define MPC52xx_PSC_SICR_SYNCPOL (1 << 20) +#define MPC52xx_PSC_SICR_CELLSLAVE (1 << 19) +#define MPC52xx_PSC_SICR_CELL2XCLK (1 << 18) +#define MPC52xx_PSC_SICR_ESAI (1 << 17) +#define MPC52xx_PSC_SICR_ENAC97 (1 << 16) +#define MPC52xx_PSC_SICR_SPI (1 << 15) +#define MPC52xx_PSC_SICR_MSTR (1 << 14) +#define MPC52xx_PSC_SICR_CPOL (1 << 13) 
+#define MPC52xx_PSC_SICR_CPHA (1 << 12) +#define MPC52xx_PSC_SICR_USEEOF (1 << 11) +#define MPC52xx_PSC_SICR_DISABLEEOF (1 << 10) /* Structure of the hardware registers */ struct mpc52xx_psc { -- cgit v1.2.3 From 483fad1c3fa1060d7e6710e84a065ad514571739 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Tue, 22 Jul 2008 04:48:46 +1000 Subject: ELF loader support for auxvec base platform string Some IBM POWER-based platforms have the ability to run in a mode which mostly appears to the OS as a different processor from the actual hardware. For example, a Power6 system may appear to be a Power5+, which makes the AT_PLATFORM value "power5+". This means that programs are restricted to the ISA supported by Power5+; Power6-specific instructions are treated as illegal. However, some applications (virtual machines, optimized libraries) can benefit from knowledge of the underlying CPU model. A new aux vector entry, AT_BASE_PLATFORM, will denote the actual hardware. For example, on a Power6 system in Power5+ compatibility mode, AT_PLATFORM will be "power5+" and AT_BASE_PLATFORM will be "power6". The idea is that AT_PLATFORM indicates the instruction set supported, while AT_BASE_PLATFORM indicates the underlying microarchitecture. If the architecture has defined ELF_BASE_PLATFORM, copy that value to the user stack in the same manner as ELF_PLATFORM. Signed-off-by: Nathan Lynch Acked-by: Andrew Morton Signed-off-by: Benjamin Herrenschmidt --- fs/binfmt_elf.c | 28 ++++++++++++++++++++++++++++ include/linux/auxvec.h | 6 +++++- 2 files changed, 33 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 639d2d8b5710..742c8f530481 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -131,6 +131,15 @@ static int padzero(unsigned long elf_bss) #define STACK_ALLOC(sp, len) ({ sp -= len ; sp; }) #endif +#ifndef ELF_BASE_PLATFORM +/* + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. 
+ * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + static int create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned long load_addr, unsigned long interp_load_addr) @@ -142,7 +151,9 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *envp; elf_addr_t __user *sp; elf_addr_t __user *u_platform; + elf_addr_t __user *u_base_platform; const char *k_platform = ELF_PLATFORM; + const char *k_base_platform = ELF_BASE_PLATFORM; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -172,6 +183,19 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. + */ + u_base_platform = NULL; + if (k_base_platform) { + size_t len = strlen(k_base_platform) + 1; + + u_base_platform = (elf_addr_t __user *)STACK_ALLOC(p, len); + if (__copy_to_user(u_base_platform, k_base_platform, len)) + return -EFAULT; + } + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -209,6 +233,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_PLATFORM, (elf_addr_t)(unsigned long)u_platform); } + if (k_base_platform) { + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t)(unsigned long)u_base_platform); + } if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); } diff --git a/include/linux/auxvec.h b/include/linux/auxvec.h index 0da17d14fd13..d7afa9dd6635 100644 --- a/include/linux/auxvec.h +++ b/include/linux/auxvec.h @@ -26,9 +26,13 @@ #define AT_SECURE 23 /* secure mode boolean */ +#define AT_BASE_PLATFORM 24 /* string identifying real platform, may + * differ from AT_PLATFORM. 
*/ + #define AT_EXECFN 31 /* filename of program */ + #ifdef __KERNEL__ -#define AT_VECTOR_SIZE_BASE 17 /* NEW_AUX_ENT entries in auxiliary table */ +#define AT_VECTOR_SIZE_BASE 18 /* NEW_AUX_ENT entries in auxiliary table */ /* number of "#define AT_.*" above, minus {AT_NULL, AT_IGNORE, AT_NOTELF} */ #endif -- cgit v1.2.3 From 9115d13453dee22473a1e8cacc90a8d64a9c4bc9 Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Wed, 16 Jul 2008 09:58:51 +1000 Subject: powerpc: Enable AT_BASE_PLATFORM aux vector Stash the first platform string matched by identify_cpu() in powerpc_base_platform, and supply that to the ELF loader for the value of AT_BASE_PLATFORM. Signed-off-by: Nathan Lynch Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/cputable.c | 11 +++++++++++ include/asm-powerpc/cputable.h | 2 ++ include/asm-powerpc/elf.h | 8 ++++++++ 3 files changed, 21 insertions(+) (limited to 'include') diff --git a/arch/powerpc/kernel/cputable.c b/arch/powerpc/kernel/cputable.c index b936a1dd0a50..25a052c16754 100644 --- a/arch/powerpc/kernel/cputable.c +++ b/arch/powerpc/kernel/cputable.c @@ -23,6 +23,9 @@ struct cpu_spec* cur_cpu_spec = NULL; EXPORT_SYMBOL(cur_cpu_spec); +/* The platform string corresponding to the real PVR */ +const char *powerpc_base_platform; + /* NOTE: * Unlike ppc32, ppc64 will only call this once for the boot CPU, it's * the responsibility of the appropriate CPU save/restore functions to @@ -1652,6 +1655,14 @@ struct cpu_spec * __init identify_cpu(unsigned long offset, unsigned int pvr) } else *t = *s; *PTRRELOC(&cur_cpu_spec) = &the_cpu_spec; + + /* + * Set the base platform string once; assumes + * we're called with real pvr first. + */ + if (powerpc_base_platform == NULL) + powerpc_base_platform = t->platform; + #if defined(CONFIG_PPC64) || defined(CONFIG_BOOKE) /* ppc64 and booke expect identify_cpu to also call * setup_cpu for that processor. 
I will consolidate diff --git a/include/asm-powerpc/cputable.h b/include/asm-powerpc/cputable.h index 2a3e9075a5a0..ef8a248dfd55 100644 --- a/include/asm-powerpc/cputable.h +++ b/include/asm-powerpc/cputable.h @@ -127,6 +127,8 @@ extern struct cpu_spec *identify_cpu(unsigned long offset, unsigned int pvr); extern void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end); +extern const char *powerpc_base_platform; + #endif /* __ASSEMBLY__ */ /* CPU kernel features */ diff --git a/include/asm-powerpc/elf.h b/include/asm-powerpc/elf.h index 89664675b469..80d1f399ee51 100644 --- a/include/asm-powerpc/elf.h +++ b/include/asm-powerpc/elf.h @@ -217,6 +217,14 @@ typedef elf_vrregset_t elf_fpxregset_t; #define ELF_PLATFORM (cur_cpu_spec->platform) +/* While ELF_PLATFORM indicates the ISA supported by the platform, it + * may not accurately reflect the underlying behavior of the hardware + * (as in the case of running in Power5+ compatibility mode on a + * Power6 machine). ELF_BASE_PLATFORM allows ld.so to load libraries + * that are tuned for the real hardware. + */ +#define ELF_BASE_PLATFORM (powerpc_base_platform) + #ifdef __powerpc64__ # define ELF_PLAT_INIT(_r, load_addr) do { \ _r->gpr[2] = load_addr; \ -- cgit v1.2.3 From d6a61bfc06d6f2248f3e75f208d64e794082013c Mon Sep 17 00:00:00 2001 From: Luis Machado Date: Thu, 24 Jul 2008 02:10:41 +1000 Subject: powerpc: BookE hardware watchpoint support This patch implements support for HW based watchpoint via the DBSR_DAC (Data Address Compare) facility of the BookE processors. 
It does so by interfacing with the existing DABR breakpoint code and adding the necessary bits and pieces for the new bits to be properly set or cleared Signed-off-by: Luis Machado Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/entry_32.S | 6 ++-- arch/powerpc/kernel/process.c | 46 +++++++++++++++++++++++++++ arch/powerpc/kernel/ptrace.c | 72 ++++++++++++++++++++++++++++++++++++++---- arch/powerpc/kernel/signal.c | 6 +++- arch/powerpc/kernel/traps.c | 16 ++++++++++ arch/powerpc/mm/fault.c | 25 --------------- include/asm-powerpc/system.h | 2 ++ 7 files changed, 138 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index da52269aec1e..81c8324a4a3c 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -148,7 +148,7 @@ transfer_to_handler: /* Check to see if the dbcr0 register is set up to debug. Use the internal debug mode bit to do this. */ lwz r12,THREAD_DBCR0(r12) - andis. r12,r12,DBCR0_IDM@h + andis. r12,r12,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h beq+ 3f /* From user and task is ptraced - load up global dbcr0 */ li r12,-1 /* clear all pending debug events */ @@ -292,7 +292,7 @@ syscall_exit_cont: /* If the process has its own DBCR0 value, load it up. The internal debug mode bit tells us that dbcr0 should be loaded. */ lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IDM@h + andis. r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h bnel- load_dbcr0 #endif #ifdef CONFIG_44x @@ -720,7 +720,7 @@ restore_user: /* Check whether this process has its own DBCR0 value. The internal debug mode bit tells us that dbcr0 should be loaded. */ lwz r0,THREAD+THREAD_DBCR0(r2) - andis. r10,r0,DBCR0_IDM@h + andis. 
r10,r0,(DBCR0_IDM | DBSR_DAC1R | DBSR_DAC1W)@h bnel- load_dbcr0 #endif diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 219f3634115e..db2497ccc111 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -47,6 +47,8 @@ #ifdef CONFIG_PPC64 #include #endif +#include +#include extern unsigned long _get_SP(void); @@ -239,6 +241,35 @@ void discard_lazy_cpu_state(void) } #endif /* CONFIG_SMP */ +void do_dabr(struct pt_regs *regs, unsigned long address, + unsigned long error_code) +{ + siginfo_t info; + + if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, + 11, SIGSEGV) == NOTIFY_STOP) + return; + + if (debugger_dabr_match(regs)) + return; + + /* Clear the DAC and struct entries. One shot trigger */ +#if (defined(CONFIG_44x) || defined(CONFIG_BOOKE)) + mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | DBSR_DAC1W + | DBCR0_IDM)); +#endif + + /* Clear the DABR */ + set_dabr(0); + + /* Deliver the signal to userspace */ + info.si_signo = SIGTRAP; + info.si_errno = 0; + info.si_code = TRAP_HWBKPT; + info.si_addr = (void __user *)address; + force_sig_info(SIGTRAP, &info, current); +} + static DEFINE_PER_CPU(unsigned long, current_dabr); int set_dabr(unsigned long dabr) @@ -254,6 +285,11 @@ int set_dabr(unsigned long dabr) #if defined(CONFIG_PPC64) || defined(CONFIG_6xx) mtspr(SPRN_DABR, dabr); #endif + +#if defined(CONFIG_44x) || defined(CONFIG_BOOKE) + mtspr(SPRN_DAC1, dabr); +#endif + return 0; } @@ -337,6 +373,12 @@ struct task_struct *__switch_to(struct task_struct *prev, if (unlikely(__get_cpu_var(current_dabr) != new->thread.dabr)) set_dabr(new->thread.dabr); +#if defined(CONFIG_44x) || defined(CONFIG_BOOKE) + /* If new thread DAC (HW breakpoint) is the same then leave it */ + if (new->thread.dabr) + set_dabr(new->thread.dabr); +#endif + new_thread = &new->thread; old_thread = &current->thread; @@ -525,6 +567,10 @@ void flush_thread(void) if (current->thread.dabr) { current->thread.dabr = 0; set_dabr(0); + 
+#if defined(CONFIG_44x) || defined(CONFIG_BOOKE) + current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W); +#endif } } diff --git a/arch/powerpc/kernel/ptrace.c b/arch/powerpc/kernel/ptrace.c index 8feb93e7890c..a5d0e78779c8 100644 --- a/arch/powerpc/kernel/ptrace.c +++ b/arch/powerpc/kernel/ptrace.c @@ -703,7 +703,7 @@ void user_enable_single_step(struct task_struct *task) if (regs != NULL) { #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - task->thread.dbcr0 = DBCR0_IDM | DBCR0_IC; + task->thread.dbcr0 |= DBCR0_IDM | DBCR0_IC; regs->msr |= MSR_DE; #else regs->msr |= MSR_SE; @@ -716,9 +716,16 @@ void user_disable_single_step(struct task_struct *task) { struct pt_regs *regs = task->thread.regs; + +#if defined(CONFIG_44x) || defined(CONFIG_BOOKE) + /* If DAC then do not single step, skip */ + if (task->thread.dabr) + return; +#endif + if (regs != NULL) { #if defined(CONFIG_40x) || defined(CONFIG_BOOKE) - task->thread.dbcr0 = 0; + task->thread.dbcr0 &= ~(DBCR0_IC | DBCR0_IDM); regs->msr &= ~MSR_DE; #else regs->msr &= ~MSR_SE; @@ -727,22 +734,75 @@ void user_disable_single_step(struct task_struct *task) clear_tsk_thread_flag(task, TIF_SINGLESTEP); } -static int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, +int ptrace_set_debugreg(struct task_struct *task, unsigned long addr, unsigned long data) { - /* We only support one DABR and no IABRS at the moment */ + /* For ppc64 we support one DABR and no IABR's at the moment (ppc64). + * For embedded processors we support one DAC and no IAC's at the + * moment. + */ if (addr > 0) return -EINVAL; - /* The bottom 3 bits are flags */ if ((data & ~0x7UL) >= TASK_SIZE) return -EIO; - /* Ensure translation is on */ +#ifdef CONFIG_PPC64 + + /* For processors using DABR (i.e. 970), the bottom 3 bits are flags. 
+ * It was assumed, on previous implementations, that 3 bits were + * passed together with the data address, fitting the design of the + * DABR register, as follows: + * + * bit 0: Read flag + * bit 1: Write flag + * bit 2: Breakpoint translation + * + * Thus, we use them here as so. + */ + + /* Ensure breakpoint translation bit is set */ if (data && !(data & DABR_TRANSLATION)) return -EIO; + /* Move contents to the DABR register */ task->thread.dabr = data; + +#endif +#if defined(CONFIG_44x) || defined(CONFIG_BOOKE) + + /* As described above, it was assumed 3 bits were passed with the data + * address, but we will assume only the mode bits will be passed + * as to not cause alignment restrictions for DAC-based processors. + */ + + /* DAC's hold the whole address without any mode flags */ + task->thread.dabr = data & ~0x3UL; + + if (task->thread.dabr == 0) { + task->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | DBCR0_IDM); + task->thread.regs->msr &= ~MSR_DE; + return 0; + } + + /* Read or Write bits must be set */ + + if (!(data & 0x3UL)) + return -EINVAL; + + /* Set the Internal Debugging flag (IDM bit 1) for the DBCR0 + register */ + task->thread.dbcr0 = DBCR0_IDM; + + /* Check for write and read flags and set DBCR0 + accordingly */ + if (data & 0x1UL) + task->thread.dbcr0 |= DBSR_DAC1R; + if (data & 0x2UL) + task->thread.dbcr0 |= DBSR_DAC1W; + + task->thread.regs->msr |= MSR_DE; +#endif return 0; } diff --git a/arch/powerpc/kernel/signal.c b/arch/powerpc/kernel/signal.c index ad55488939c3..7aada783ec6a 100644 --- a/arch/powerpc/kernel/signal.c +++ b/arch/powerpc/kernel/signal.c @@ -145,8 +145,12 @@ int do_signal(sigset_t *oldset, struct pt_regs *regs) * user space. The DABR will have been cleared if it * triggered inside the kernel. 
*/ - if (current->thread.dabr) + if (current->thread.dabr) { set_dabr(current->thread.dabr); +#if defined(CONFIG_44x) || defined(CONFIG_BOOKE) + mtspr(SPRN_DBCR0, current->thread.dbcr0); +#endif + } if (is32) { if (ka.sa.sa_flags & SA_SIGINFO) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 878fbddb6ae1..81ccb8dd1a54 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -1067,6 +1067,22 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status) } _exception(SIGTRAP, regs, TRAP_TRACE, regs->nip); + } else if (debug_status & (DBSR_DAC1R | DBSR_DAC1W)) { + regs->msr &= ~MSR_DE; + + if (user_mode(regs)) { + current->thread.dbcr0 &= ~(DBSR_DAC1R | DBSR_DAC1W | + DBCR0_IDM); + } else { + /* Disable DAC interrupts */ + mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) & ~(DBSR_DAC1R | + DBSR_DAC1W | DBCR0_IDM)); + + /* Clear the DAC event */ + mtspr(SPRN_DBSR, (DBSR_DAC1R | DBSR_DAC1W)); + } + /* Setup and send the trap to the handler */ + do_dabr(regs, mfspr(SPRN_DAC1), debug_status); } } #endif /* CONFIG_4xx || CONFIG_BOOKE */ diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 1707d00331fc..565b7a237c84 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -100,31 +100,6 @@ static int store_updates_sp(struct pt_regs *regs) return 0; } -#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE)) -static void do_dabr(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - siginfo_t info; - - if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code, - 11, SIGSEGV) == NOTIFY_STOP) - return; - - if (debugger_dabr_match(regs)) - return; - - /* Clear the DABR */ - set_dabr(0); - - /* Deliver the signal to userspace */ - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = TRAP_HWBKPT; - info.si_addr = (void __user *)address; - force_sig_info(SIGTRAP, &info, current); -} -#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/ - /* * For 600- and 800-family processors, 
the error_code parameter is DSISR * for a data fault, SRR1 for an instruction fault. For 400-family processors diff --git a/include/asm-powerpc/system.h b/include/asm-powerpc/system.h index e6e25e2364eb..d6648c143322 100644 --- a/include/asm-powerpc/system.h +++ b/include/asm-powerpc/system.h @@ -110,6 +110,8 @@ static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; } #endif extern int set_dabr(unsigned long dabr); +extern void do_dabr(struct pt_regs *regs, unsigned long address, + unsigned long error_code); extern void print_backtrace(unsigned long *); extern void show_regs(struct pt_regs * regs); extern void flush_instruction_cache(void); -- cgit v1.2.3 From dfc3403f0e5ffb94ee29942f313b87d4061d951b Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Thu, 24 Jul 2008 04:27:30 +1000 Subject: powerpc/pseries: Add memory entitlement capabilities to /proc/ppc64/lparcfg Update /proc/ppc64/lparcfg to display Cooperative Memory Overcommitment statistics as reported by the H_GET_MPP hcall. This also updates the lparcfg interface to allow setting memory entitlement and weight. Signed-off-by: Nathan Fontenot Signed-off-by: Robert Jennings Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/lparcfg.c | 121 +++++++++++++++++++++++++++++++++++++++++- include/asm-powerpc/hvcall.h | 18 ++++++- 2 files changed, 137 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index a0ca90ab5e39..86e5b3ed10d8 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -35,7 +35,7 @@ #include #include -#define MODULE_VERS "1.7" +#define MODULE_VERS "1.8" #define MODULE_NAME "lparcfg" /* #define LPARCFG_DEBUG */ @@ -129,6 +129,35 @@ static int iseries_lparcfg_data(struct seq_file *m, void *v) /* * Methods used to fetch LPAR data when running on a pSeries platform. 
*/ +/** + * h_get_mpp + * H_GET_MPP hcall returns info in 7 parms + */ +int h_get_mpp(struct hvcall_mpp_data *mpp_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_MPP, retbuf); + + mpp_data->entitled_mem = retbuf[0]; + mpp_data->mapped_mem = retbuf[1]; + + mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + mpp_data->pool_num = retbuf[2] & 0xffff; + + mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; + mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; + + mpp_data->pool_size = retbuf[4]; + mpp_data->loan_request = retbuf[5]; + mpp_data->backing_mem = retbuf[6]; + + return rc; +} +EXPORT_SYMBOL(h_get_mpp); + /* * H_GET_PPP hcall returns info in 4 parms. * entitled_capacity,unallocated_capacity, @@ -224,6 +253,44 @@ static void parse_ppp_data(struct seq_file *m) seq_printf(m, "unallocated_capacity=%ld\n", h_unallocated); } +/** + * parse_mpp_data + * Parse out data returned from h_get_mpp + */ +static void parse_mpp_data(struct seq_file *m) +{ + struct hvcall_mpp_data mpp_data; + int rc; + + rc = h_get_mpp(&mpp_data); + if (rc) + return; + + seq_printf(m, "entitled_memory=%ld\n", mpp_data.entitled_mem); + + if (mpp_data.mapped_mem != -1) + seq_printf(m, "mapped_entitled_memory=%ld\n", + mpp_data.mapped_mem); + + seq_printf(m, "entitled_memory_group_number=%d\n", mpp_data.group_num); + seq_printf(m, "entitled_memory_pool_number=%d\n", mpp_data.pool_num); + + seq_printf(m, "entitled_memory_weight=%d\n", mpp_data.mem_weight); + seq_printf(m, "unallocated_entitled_memory_weight=%d\n", + mpp_data.unallocated_mem_weight); + seq_printf(m, "unallocated_io_mapping_entitlement=%ld\n", + mpp_data.unallocated_entitlement); + + if (mpp_data.pool_size != -1) + seq_printf(m, "entitled_memory_pool_size=%ld bytes\n", + mpp_data.pool_size); + + seq_printf(m, "entitled_memory_loan_request=%ld\n", + mpp_data.loan_request); + + seq_printf(m, "backing_memory=%ld 
bytes\n", mpp_data.backing_mem); +} + #define SPLPAR_CHARACTERISTICS_TOKEN 20 #define SPLPAR_MAXLENGTH 1026*(sizeof(char)) @@ -351,6 +418,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) /* this call handles the ibm,get-system-parameter contents */ parse_system_parameter_string(m); parse_ppp_data(m); + parse_mpp_data(m); seq_printf(m, "purr=%ld\n", get_purr()); } else { /* non SPLPAR case */ @@ -414,6 +482,43 @@ static ssize_t update_ppp(u64 *entitlement, u8 *weight) return retval; } +/** + * update_mpp + * + * Update the memory entitlement and weight for the partition. Caller must + * specify either a new entitlement or weight, not both, to be updated + * since the h_set_mpp call takes both entitlement and weight as parameters. + */ +static ssize_t update_mpp(u64 *entitlement, u8 *weight) +{ + struct hvcall_mpp_data mpp_data; + u64 new_entitled; + u8 new_weight; + ssize_t rc; + + rc = h_get_mpp(&mpp_data); + if (rc) + return rc; + + if (entitlement) { + new_weight = mpp_data.mem_weight; + new_entitled = *entitlement; + } else if (weight) { + new_weight = *weight; + new_entitled = mpp_data.entitled_mem; + } else + return -EINVAL; + + pr_debug("%s: current_entitled = %lu, current_weight = %u\n", + __FUNCTION__, mpp_data.entitled_mem, mpp_data.mem_weight); + + pr_debug("%s: new_entitled = %lu, new_weight = %u\n", + __FUNCTION__, new_entitled, new_weight); + + rc = plpar_hcall_norets(H_SET_MPP, new_entitled, new_weight); + return rc; +} + /* * Interface for changing system parameters (variable capacity weight * and entitled capacity). 
Format of input is "param_name=value"; @@ -467,6 +572,20 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, goto out; retval = update_ppp(NULL, new_weight_ptr); + } else if (!strcmp(kbuf, "entitled_memory")) { + char *endp; + *new_entitled_ptr = (u64) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + goto out; + + retval = update_mpp(new_entitled_ptr, NULL); + } else if (!strcmp(kbuf, "entitled_memory_weight")) { + char *endp; + *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); + if (endp == tmp) + goto out; + + retval = update_mpp(NULL, new_weight_ptr); } else goto out; diff --git a/include/asm-powerpc/hvcall.h b/include/asm-powerpc/hvcall.h index bf6cd7cb996c..46e76456cbbd 100644 --- a/include/asm-powerpc/hvcall.h +++ b/include/asm-powerpc/hvcall.h @@ -210,7 +210,9 @@ #define H_JOIN 0x298 #define H_VASI_STATE 0x2A4 #define H_ENABLE_CRQ 0x2B0 -#define MAX_HCALL_OPCODE H_ENABLE_CRQ +#define H_SET_MPP 0x2D0 +#define H_GET_MPP 0x2D4 +#define MAX_HCALL_OPCODE H_GET_MPP #ifndef __ASSEMBLY__ @@ -270,6 +272,20 @@ struct hcall_stats { }; #define HCALL_STAT_ARRAY_SIZE ((MAX_HCALL_OPCODE >> 2) + 1) +struct hvcall_mpp_data { + unsigned long entitled_mem; + unsigned long mapped_mem; + unsigned short group_num; + unsigned short pool_num; + unsigned char mem_weight; + unsigned char unallocated_mem_weight; + unsigned long unallocated_entitlement; /* value in bytes */ + unsigned long pool_size; + signed long loan_request; + unsigned long backing_mem; +}; + +int h_get_mpp(struct hvcall_mpp_data *); #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ -- cgit v1.2.3 From e46de429cb954d30a5642fba81d516ede518c65e Mon Sep 17 00:00:00 2001 From: Robert Jennings Date: Thu, 24 Jul 2008 04:29:03 +1000 Subject: powerpc/pseries: Enable CMO feature during platform setup For Cooperative Memory Overcommitment (CMO), set the FW_FEATURE_CMO flag in powerpc_firmware_features from the rtas ibm,get-system-parameters table prior to 
calling iommu_init_early_pSeries. With this, any CMO specific functionality can be controlled by checking: firmware_has_feature(FW_FEATURE_CMO) Signed-off-by: Robert Jennings Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/setup.c | 71 ++++++++++++++++++++++++++++++++++ include/asm-powerpc/firmware.h | 3 +- 2 files changed, 73 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 90beb444e1dd..063a0d2fba30 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -314,6 +314,76 @@ static int pseries_set_xdabr(unsigned long dabr) H_DABRX_KERNEL | H_DABRX_USER); } +#define CMO_CHARACTERISTICS_TOKEN 44 +#define CMO_MAXLENGTH 1026 + +/** + * pSeries_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, + * handle that here. (Stolen from parse_system_parameter_string) + */ +void pSeries_cmo_feature_init(void) +{ + char *ptr, *key, *value, *end; + int call_status; + int PrPSP = -1; + int SecPSP = -1; + + pr_debug(" -> fw_cmo_feature_init()\n"); + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CMO_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + + if (call_status != 0) { + spin_unlock(&rtas_data_buf_lock); + pr_debug("CMO not available\n"); + pr_debug(" <- fw_cmo_feature_init()\n"); + return; + } + + end = rtas_data_buf + CMO_MAXLENGTH - 2; + ptr = rtas_data_buf + 2; /* step over strlen value */ + key = value = ptr; + + while (*ptr && (ptr <= end)) { + /* Separate the key and value by replacing '=' with '\0' and + * point the value at the string after the '=' + */ + if (ptr[0] == '=') { + ptr[0] = '\0'; + value = ptr + 1; + } else if (ptr[0] == '\0' || ptr[0] == ',') { + /* Terminate the string containing the key/value pair */ + ptr[0] 
= '\0'; + + if (key == value) { + pr_debug("Malformed key/value pair\n"); + /* Never found a '=', end processing */ + break; + } + + if (0 == strcmp(key, "PrPSP")) + PrPSP = simple_strtol(value, NULL, 10); + else if (0 == strcmp(key, "SecPSP")) + SecPSP = simple_strtol(value, NULL, 10); + value = key = ptr + 1; + } + ptr++; + } + + if (PrPSP != -1 || SecPSP != -1) { + pr_info("CMO enabled\n"); + pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP); + powerpc_firmware_features |= FW_FEATURE_CMO; + } else + pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", PrPSP, SecPSP); + spin_unlock(&rtas_data_buf_lock); + pr_debug(" <- fw_cmo_feature_init()\n"); +} + /* * Early initialization. Relocation is on but do not reference unbolted pages */ @@ -329,6 +399,7 @@ static void __init pSeries_init_early(void) else if (firmware_has_feature(FW_FEATURE_XDABR)) ppc_md.set_dabr = pseries_set_xdabr; + pSeries_cmo_feature_init(); iommu_init_early_pSeries(); pr_debug(" <- pSeries_init_early()\n"); diff --git a/include/asm-powerpc/firmware.h b/include/asm-powerpc/firmware.h index ef328995ba9d..3a179827528d 100644 --- a/include/asm-powerpc/firmware.h +++ b/include/asm-powerpc/firmware.h @@ -46,6 +46,7 @@ #define FW_FEATURE_PS3_LV1 ASM_CONST(0x0000000000800000) #define FW_FEATURE_BEAT ASM_CONST(0x0000000001000000) #define FW_FEATURE_BULK_REMOVE ASM_CONST(0x0000000002000000) +#define FW_FEATURE_CMO ASM_CONST(0x0000000004000000) #ifndef __ASSEMBLY__ @@ -58,7 +59,7 @@ enum { FW_FEATURE_MIGRATE | FW_FEATURE_PERFMON | FW_FEATURE_CRQ | FW_FEATURE_VIO | FW_FEATURE_RDMA | FW_FEATURE_LLAN | FW_FEATURE_BULK | FW_FEATURE_XDABR | FW_FEATURE_MULTITCE | - FW_FEATURE_SPLPAR | FW_FEATURE_LPAR, + FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_ISERIES_POSSIBLE = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, FW_FEATURE_ISERIES_ALWAYS = FW_FEATURE_ISERIES | FW_FEATURE_LPAR, -- cgit v1.2.3 From 86630a32320f83736c4c24e2c8bae218e4c56c7c Mon Sep 17 00:00:00 2001 
From: Brian King Date: Thu, 24 Jul 2008 04:29:16 +1000 Subject: powerpc/pseries: Utilities to set firmware page state Newer versions of firmware support page states, which are used by the collaborative memory manager (future patch) to "loan" pages to the hypervisor for use by other partitions. Signed-off-by: Brian King Signed-off-by: Robert Jennings Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/platforms/pseries/plpar_wrappers.h | 10 ++++++++++ include/asm-powerpc/hvcall.h | 5 +++++ 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h index d8680b589dc9..a437267c6bf8 100644 --- a/arch/powerpc/platforms/pseries/plpar_wrappers.h +++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h @@ -42,6 +42,16 @@ static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa) return vpa_call(0x3, cpu, vpa); } +static inline long plpar_page_set_loaned(unsigned long vpa) +{ + return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa, 0); +} + +static inline long plpar_page_set_active(unsigned long vpa) +{ + return plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa, 0); +} + extern void vpa_init(int cpu); static inline long plpar_pte_enter(unsigned long flags, diff --git a/include/asm-powerpc/hvcall.h b/include/asm-powerpc/hvcall.h index 46e76456cbbd..fbe2932fa9e9 100644 --- a/include/asm-powerpc/hvcall.h +++ b/include/asm-powerpc/hvcall.h @@ -92,6 +92,11 @@ #define H_EXACT (1UL<<(63-24)) /* Use exact PTE or return H_PTEG_FULL */ #define H_R_XLATE (1UL<<(63-25)) /* include a valid logical page num in the pte if the valid bit is set */ #define H_READ_4 (1UL<<(63-26)) /* Return 4 PTEs */ +#define H_PAGE_STATE_CHANGE (1UL<<(63-28)) +#define H_PAGE_UNUSED ((1UL<<(63-29)) | (1UL<<(63-30))) +#define H_PAGE_SET_UNUSED (H_PAGE_STATE_CHANGE | H_PAGE_UNUSED) +#define H_PAGE_SET_LOANED (H_PAGE_SET_UNUSED | (1UL<<(63-31))) 
+#define H_PAGE_SET_ACTIVE H_PAGE_STATE_CHANGE #define H_AVPN (1UL<<(63-32)) /* An avpn is provided as a sanity test */ #define H_ANDCOND (1UL<<(63-33)) #define H_ICACHE_INVALIDATE (1UL<<(63-40)) /* icbi, etc. (ignored for IO pages) */ -- cgit v1.2.3 From ffa5abbd0c399b32fc13a1b4718d87ee7a716999 Mon Sep 17 00:00:00 2001 From: Brian King Date: Thu, 24 Jul 2008 04:30:58 +1000 Subject: powerpc/pseries: Add CMO paging statistics With the addition of Cooperative Memory Overcommitment (CMO) support for IBM Power Systems, two fields have been added to the VPA to report paging statistics. Add support in lparcfg to report them to userspace. Signed-off-by: Brian King Signed-off-by: Robert Jennings Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/lparcfg.c | 20 ++++++++++++++++++++ include/asm-powerpc/lppaca.h | 5 ++++- 2 files changed, 24 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/powerpc/kernel/lparcfg.c b/arch/powerpc/kernel/lparcfg.c index d82e1fa5ce2a..848c3e5a6370 100644 --- a/arch/powerpc/kernel/lparcfg.c +++ b/arch/powerpc/kernel/lparcfg.c @@ -409,6 +409,25 @@ static int lparcfg_count_active_processors(void) return count; } +static void pseries_cmo_data(struct seq_file *m) +{ + int cpu; + unsigned long cmo_faults = 0; + unsigned long cmo_fault_time = 0; + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return; + + for_each_possible_cpu(cpu) { + cmo_faults += lppaca[cpu].cmo_faults; + cmo_fault_time += lppaca[cpu].cmo_fault_time; + } + + seq_printf(m, "cmo_faults=%lu\n", cmo_faults); + seq_printf(m, "cmo_fault_time_usec=%lu\n", + cmo_fault_time / tb_ticks_per_usec); +} + static int pseries_lparcfg_data(struct seq_file *m, void *v) { int partition_potential_processors; @@ -434,6 +453,7 @@ static int pseries_lparcfg_data(struct seq_file *m, void *v) parse_system_parameter_string(m); parse_ppp_data(m); parse_mpp_data(m); + pseries_cmo_data(m); seq_printf(m, "purr=%ld\n", get_purr()); } else { /* non SPLPAR 
case */ diff --git a/include/asm-powerpc/lppaca.h b/include/asm-powerpc/lppaca.h index 567ed92cd91f..2fe268b10333 100644 --- a/include/asm-powerpc/lppaca.h +++ b/include/asm-powerpc/lppaca.h @@ -125,7 +125,10 @@ struct lppaca { // NOTE: This value will ALWAYS be zero for dedicated processors and // will NEVER be zero for shared processors (ie, initialized to a 1). volatile u32 yield_count; // PLIC increments each dispatchx00-x03 - u8 reserved6[124]; // Reserved x04-x7F + u32 reserved6; + volatile u64 cmo_faults; // CMO page fault count x08-x0F + volatile u64 cmo_fault_time; // CMO page fault time x10-x17 + u8 reserved7[104]; // Reserved x18-x7F //============================================================================= // CACHE_LINE_4-5 0x0180 - 0x027F Contains PMC interrupt data -- cgit v1.2.3 From 6490c4903d12f242bec4454301f76f6a7520e399 Mon Sep 17 00:00:00 2001 From: Robert Jennings Date: Thu, 24 Jul 2008 04:31:16 +1000 Subject: powerpc/pseries: iommu enablement for CMO To support Cooperative Memory Overcommitment (CMO), we need to check for failure from some of the tce hcalls. These changes for the pseries platform affect the powerpc architecture; patches for the other affected platforms are included in this patch. pSeries platform IOMMU code changes: * platform TCE functions must handle H_NOT_ENOUGH_RESOURCES errors and return an error. Architecture IOMMU code changes: * Calls to ppc_md.tce_build need to check return values and return DMA_MAPPING_ERROR for transient errors. Architecture changes: * struct machdep_calls for tce_build*_pSeriesLP functions need to change to indicate failure. * all other platforms will need updates to iommu functions to match the new calling semantics; they will return 0 on success. The other platforms default configs have been built, but no further testing was performed. 
Signed-off-by: Robert Jennings Acked-by: Olof Johansson Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/iommu.c | 28 +++++++++++++++++++---- arch/powerpc/platforms/cell/iommu.c | 3 ++- arch/powerpc/platforms/iseries/iommu.c | 3 ++- arch/powerpc/platforms/pasemi/iommu.c | 3 ++- arch/powerpc/platforms/pseries/iommu.c | 42 ++++++++++++++++++++++++++-------- arch/powerpc/sysdev/dart_iommu.c | 3 ++- include/asm-powerpc/machdep.h | 2 +- 7 files changed, 64 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 2385f68c1751..550a19399bfa 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -49,6 +49,8 @@ static int novmerge = 1; static int protect4gb = 1; +static void __iommu_free(struct iommu_table *, dma_addr_t, unsigned int); + static inline unsigned long iommu_num_pages(unsigned long vaddr, unsigned long slen) { @@ -191,6 +193,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, { unsigned long entry, flags; dma_addr_t ret = DMA_ERROR_CODE; + int build_fail; spin_lock_irqsave(&(tbl->it_lock), flags); @@ -205,9 +208,21 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl, ret = entry << IOMMU_PAGE_SHIFT; /* Set the return dma address */ /* Put the TCEs in the HW table */ - ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & IOMMU_PAGE_MASK, - direction, attrs); + build_fail = ppc_md.tce_build(tbl, entry, npages, + (unsigned long)page & IOMMU_PAGE_MASK, + direction, attrs); + + /* ppc_md.tce_build() only returns non-zero for transient errors. + * Clean up the table bitmap in this case and return + * DMA_ERROR_CODE. For all other errors the functionality is + * not altered. 
+ */ + if (unlikely(build_fail)) { + __iommu_free(tbl, ret, npages); + spin_unlock_irqrestore(&(tbl->it_lock), flags); + return DMA_ERROR_CODE; + } /* Flush/invalidate TLB caches if necessary */ if (ppc_md.tce_flush) @@ -276,7 +291,7 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, dma_addr_t dma_next = 0, dma_addr; unsigned long flags; struct scatterlist *s, *outs, *segstart; - int outcount, incount, i; + int outcount, incount, i, build_fail = 0; unsigned int align; unsigned long handle; unsigned int max_seg_size; @@ -337,8 +352,11 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl, npages, entry, dma_addr); /* Insert into HW table */ - ppc_md.tce_build(tbl, entry, npages, vaddr & IOMMU_PAGE_MASK, - direction, attrs); + build_fail = ppc_md.tce_build(tbl, entry, npages, + vaddr & IOMMU_PAGE_MASK, + direction, attrs); + if(unlikely(build_fail)) + goto failure; /* If we are in an open segment, try merging */ if (segstart != s) { diff --git a/arch/powerpc/platforms/cell/iommu.c b/arch/powerpc/platforms/cell/iommu.c index 031124a8e37b..e06420af5fe9 100644 --- a/arch/powerpc/platforms/cell/iommu.c +++ b/arch/powerpc/platforms/cell/iommu.c @@ -172,7 +172,7 @@ static void invalidate_tce_cache(struct cbe_iommu *iommu, unsigned long *pte, } } -static void tce_build_cell(struct iommu_table *tbl, long index, long npages, +static int tce_build_cell(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -213,6 +213,7 @@ static void tce_build_cell(struct iommu_table *tbl, long index, long npages, pr_debug("tce_build_cell(index=%lx,n=%lx,dir=%d,base_pte=%lx)\n", index, npages, direction, base_pte); + return 0; } static void tce_free_cell(struct iommu_table *tbl, long index, long npages) diff --git a/arch/powerpc/platforms/iseries/iommu.c b/arch/powerpc/platforms/iseries/iommu.c index bc818e4e2033..bb464d1211b2 100644 --- a/arch/powerpc/platforms/iseries/iommu.c +++ 
b/arch/powerpc/platforms/iseries/iommu.c @@ -41,7 +41,7 @@ #include #include -static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, +static int tce_build_iSeries(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { @@ -71,6 +71,7 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, index++; uaddr += TCE_PAGE_SIZE; } + return 0; } static void tce_free_iSeries(struct iommu_table *tbl, long index, long npages) diff --git a/arch/powerpc/platforms/pasemi/iommu.c b/arch/powerpc/platforms/pasemi/iommu.c index 70541b7a5013..a0ff03a3d8da 100644 --- a/arch/powerpc/platforms/pasemi/iommu.c +++ b/arch/powerpc/platforms/pasemi/iommu.c @@ -83,7 +83,7 @@ static u32 *iob_l2_base; static struct iommu_table iommu_table_iobmap; static int iommu_table_iobmap_inited; -static void iobmap_build(struct iommu_table *tbl, long index, +static int iobmap_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) @@ -108,6 +108,7 @@ static void iobmap_build(struct iommu_table *tbl, long index, uaddr += IOBMAP_PAGE_SIZE; bus_addr += IOBMAP_PAGE_SIZE; } + return 0; } diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 5377dd4b849a..a8c446697f9e 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -48,7 +48,7 @@ #include "plpar_wrappers.h" -static void tce_build_pSeries(struct iommu_table *tbl, long index, +static int tce_build_pSeries(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) @@ -72,6 +72,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index, uaddr += TCE_PAGE_SIZE; tcep++; } + return 0; } @@ -94,14 +95,19 @@ static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) return 
*tcep; } -static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, +static void tce_free_pSeriesLP(struct iommu_table*, long, long); +static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); + +static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { - u64 rc; + u64 rc = 0; u64 proto_tce, tce; u64 rpn; + int ret = 0; + long tcenum_start = tcenum, npages_start = npages; rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; proto_tce = TCE_PCI_READ; @@ -112,6 +118,13 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; + tce_free_pSeriesLP(tbl, tcenum_start, + (npages_start - (npages + 1))); + break; + } + if (rc && printk_ratelimit()) { printk("tce_build_pSeriesLP: plpar_tce_put failed. 
rc=%ld\n", rc); printk("\tindex = 0x%lx\n", (u64)tbl->it_index); @@ -123,25 +136,27 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum++; rpn++; } + return ret; } static DEFINE_PER_CPU(u64 *, tce_page) = NULL; -static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) { - u64 rc; + u64 rc = 0; u64 proto_tce; u64 *tcep; u64 rpn; long l, limit; + long tcenum_start = tcenum, npages_start = npages; + int ret = 0; if (npages == 1) { - tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, - direction, attrs); - return; + return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + direction, attrs); } tcep = __get_cpu_var(tce_page); @@ -153,9 +168,8 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (u64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, direction, attrs); - return; } __get_cpu_var(tce_page) = tcep; } @@ -187,6 +201,13 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; + tce_freemulti_pSeriesLP(tbl, tcenum_start, + (npages_start - (npages + limit))); + return ret; + } + if (rc && printk_ratelimit()) { printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. 
rc=%ld\n", rc); printk("\tindex = 0x%lx\n", (u64)tbl->it_index); @@ -194,6 +215,7 @@ static void tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, printk("\ttce[0] val = 0x%lx\n", tcep[0]); show_stack(current, (unsigned long *)__get_SP()); } + return ret; } static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) diff --git a/arch/powerpc/sysdev/dart_iommu.c b/arch/powerpc/sysdev/dart_iommu.c index de8c8b542cfa..89639ecbf381 100644 --- a/arch/powerpc/sysdev/dart_iommu.c +++ b/arch/powerpc/sysdev/dart_iommu.c @@ -147,7 +147,7 @@ static void dart_flush(struct iommu_table *tbl) } } -static void dart_build(struct iommu_table *tbl, long index, +static int dart_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, struct dma_attrs *attrs) @@ -184,6 +184,7 @@ static void dart_build(struct iommu_table *tbl, long index, } else { dart_dirty = 1; } + return 0; } diff --git a/include/asm-powerpc/machdep.h b/include/asm-powerpc/machdep.h index 1233d735fd28..893aafd87fde 100644 --- a/include/asm-powerpc/machdep.h +++ b/include/asm-powerpc/machdep.h @@ -76,7 +76,7 @@ struct machdep_calls { * destroyed as well */ void (*hpte_clear_all)(void); - void (*tce_build)(struct iommu_table * tbl, + int (*tce_build)(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, -- cgit v1.2.3 From a90ab95a9576d35de0d05f9f4fc435edcccafaa9 Mon Sep 17 00:00:00 2001 From: Robert Jennings Date: Thu, 24 Jul 2008 04:31:33 +1000 Subject: powerpc/pseries: vio bus support for CMO This is a large patch but the normal code path is not affected. For non-pSeries platforms the code is ifdef'ed out and for non-CMO enabled pSeries systems this does not affect the normal code path. Devices that do not perform DMA operations do not need modification with this patch. The function get_desired_dma was renamed from get_io_entitlement for clarity. 
Overview Cooperative Memory Overcommitment (CMO) allows for a set of OS partitions to be run with less RAM than the aggregate needs of the group of partitions. The firmware will balance memory between the partitions and page in/out memory as needed. Based on the number and type of IO adapters present, each partition is allocated an amount of memory for DMA operations and this allocation will be guaranteed to the partition; this is referred to as the partition's 'entitlement'. Partitions running in a CMO environment can only have virtual IO devices present. The VIO bus layer will manage the IO entitlement for the system. Accounting, at a system and per-device level, is tracked in the VIO bus code and exposed via sysfs. A set of dma_ops functions are added to the bus to allow for this accounting. Bus initialization At initialization, the bus will calculate the minimum needs of the system based on providing each device present with a standard minimum entitlement along with a spare allocation for the bus to handle hotplug events. If the minimum needs can not be met the system boot will be halted. Device changes The significant changes for devices while running under CMO are that the devices must specify how much dedicated IO entitlement they desire and must also handle DMA mapping errors that can occur due to constrained IO memory. The virtual IO drivers are modified to silence errors when DMA mappings fail for CMO and handle these failures gracefully. Each device will be guaranteed a minimum entitlement that can always be mapped. Devices will specify how much entitlement they desire and the VIO bus will attempt to provide for this. Devices can change their desired entitlement level at any point in time to address particular needs (via vio_cmo_set_dev_desired()), not just at device probe time. VIO bus changes The system will have a particular entitlement level available from which it can provide memory to the devices.
The bus defines two pools of memory within this entitlement, the reserve and excess pools. Each device is provided with its own entitlement no less than a system defined minimum entitlement and no greater than what the device has specified as its desired entitlement. The entitlement provided to devices comes from the reserve pool. The reserve pool can also contain a spare allocation as large as the system defined minimum entitlement which is used for device hotplug events. Any entitlement not needed to fulfill the needs of a reserve pool is placed in the excess pool. Each device is guaranteed that it can map up to its entitled level; additional mappings are possible as long as there is unmapped memory in the excess pool. Bus probe As the system starts, each device is given an entitlement equal only to the system defined minimum entitlement. The reserve pool is equal to the sum of these entitlements, plus a spare allocation. The VIO bus also tracks the aggregate desired entitlement of all the devices. If the system desired entitlement is greater than the size of the reserve pool, when devices unmap IO memory it will be reserved and a balance operation will be scheduled for some time in the future. Entitlement balancing The balance function tries to fairly distribute entitlement between the devices in the system with the goal of providing each device with its desired amount of entitlement. Devices using more than what would be ideal will have their entitled set-point adjusted; this will effectively set a goal for lower IO memory usage as future mappings can fail and deallocations will trigger a balance operation to distribute the newly unmapped memory. A fair distribution of entitlement can take several balance operations to achieve. Entitlement changes and device DLPAR events will alter the state of CMO and will trigger balance operations. Hotplug events The VIO bus allows for changes in system entitlement at run-time via 'vio_cmo_entitlement_update()'.
When devices are added the hotplug device event will be preceded by a system entitlement increase and this is reversed when devices are removed. The following changes are made to the VIO bus layer for CMO: * add IO memory accounting per device structure. * add IO memory entitlement query function to driver structure. * during vio bus probe, if CMO is enabled, check that driver has memory entitlement query function defined. Fail if function not defined. * fail to register driver if io entitlement function not defined. * create set of dma_ops at vio level for CMO that will track allocations and return DMA failures once entitlement is reached. Entitlement will be limited by overall system entitlement. Devices will have a reserved quantity of memory that is guaranteed, the rest can be used as available. * expose entitlement, current allocation, desired allocation, and the allocation error counter for devices to the user through sysfs * provide mechanism for changing a device's desired entitlement at run time for devices as an exported function and sysfs tunable * track any DMA failures for entitled IO memory for each vio device.
* check entitlement against available system entitlement on device add * track entitlement metrics (high water mark, current usage) * provide function to reset high water mark * provide minimum and desired entitlement numbers at a bus level * provide drivers with a minimum guaranteed entitlement * balance available entitlement between devices to satisfy their needs * handle system entitlement changes and device hotplug Signed-off-by: Robert Jennings Acked-by: Paul Mackerras Signed-off-by: Benjamin Herrenschmidt --- arch/powerpc/kernel/vio.c | 1033 ++++++++++++++++++++++++++++++++++++++++++++- include/asm-powerpc/vio.h | 27 +- 2 files changed, 1052 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/arch/powerpc/kernel/vio.c b/arch/powerpc/kernel/vio.c index b77f8af7ddde..ade8aeaa2e70 100644 --- a/arch/powerpc/kernel/vio.c +++ b/arch/powerpc/kernel/vio.c @@ -1,11 +1,12 @@ /* * IBM PowerPC Virtual I/O Infrastructure Support. * - * Copyright (c) 2003-2005 IBM Corp. + * Copyright (c) 2003,2008 IBM Corp. 
* Dave Engebretsen engebret@us.ibm.com * Santiago Leon santil@us.ibm.com * Hollis Blanchard * Stephen Rothwell + * Robert Jennings * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -46,6 +47,996 @@ static struct vio_dev vio_bus_device = { /* fake "parent" device */ .dev.bus = &vio_bus_type, }; +#ifdef CONFIG_PPC_SMLPAR +/** + * vio_cmo_pool - A pool of IO memory for CMO use + * + * @size: The size of the pool in bytes + * @free: The amount of free memory in the pool + */ +struct vio_cmo_pool { + size_t size; + size_t free; +}; + +/* How many ms to delay queued balance work */ +#define VIO_CMO_BALANCE_DELAY 100 + +/* Portion out IO memory to CMO devices by this chunk size */ +#define VIO_CMO_BALANCE_CHUNK 131072 + +/** + * vio_cmo_dev_entry - A device that is CMO-enabled and requires entitlement + * + * @vio_dev: struct vio_dev pointer + * @list: pointer to other devices on bus that are being tracked + */ +struct vio_cmo_dev_entry { + struct vio_dev *viodev; + struct list_head list; +}; + +/** + * vio_cmo - VIO bus accounting structure for CMO entitlement + * + * @lock: spinlock for entire structure + * @balance_q: work queue for balancing system entitlement + * @device_list: list of CMO-enabled devices requiring entitlement + * @entitled: total system entitlement in bytes + * @reserve: pool of memory from which devices reserve entitlement, incl. 
spare + * @excess: pool of excess entitlement not needed for device reserves or spare + * @spare: IO memory for device hotplug functionality + * @min: minimum necessary for system operation + * @desired: desired memory for system operation + * @curr: bytes currently allocated + * @high: high water mark for IO data usage + */ +struct vio_cmo { + spinlock_t lock; + struct delayed_work balance_q; + struct list_head device_list; + size_t entitled; + struct vio_cmo_pool reserve; + struct vio_cmo_pool excess; + size_t spare; + size_t min; + size_t desired; + size_t curr; + size_t high; +} vio_cmo; + +/** + * vio_cmo_OF_devices - Count the number of OF devices that have DMA windows + */ +static int vio_cmo_num_OF_devs(void) +{ + struct device_node *node_vroot; + int count = 0; + + /* + * Count the number of vdevice entries with an + * ibm,my-dma-window OF property + */ + node_vroot = of_find_node_by_name(NULL, "vdevice"); + if (node_vroot) { + struct device_node *of_node; + struct property *prop; + + for_each_child_of_node(node_vroot, of_node) { + prop = of_find_property(of_node, "ibm,my-dma-window", + NULL); + if (prop) + count++; + } + } + of_node_put(node_vroot); + return count; +} + +/** + * vio_cmo_alloc - allocate IO memory for CMO-enable devices + * + * @viodev: VIO device requesting IO memory + * @size: size of allocation requested + * + * Allocations come from memory reserved for the devices and any excess + * IO memory available to all devices. The spare pool used to service + * hotplug must be equal to %VIO_CMO_MIN_ENT for the excess pool to be + * made available. 
+ * + * Return codes: + * 0 for successful allocation and -ENOMEM for a failure + */ +static inline int vio_cmo_alloc(struct vio_dev *viodev, size_t size) +{ + unsigned long flags; + size_t reserve_free = 0; + size_t excess_free = 0; + int ret = -ENOMEM; + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Determine the amount of free entitlement available in reserve */ + if (viodev->cmo.entitled > viodev->cmo.allocated) + reserve_free = viodev->cmo.entitled - viodev->cmo.allocated; + + /* If spare is not fulfilled, the excess pool can not be used. */ + if (vio_cmo.spare >= VIO_CMO_MIN_ENT) + excess_free = vio_cmo.excess.free; + + /* The request can be satisfied */ + if ((reserve_free + excess_free) >= size) { + vio_cmo.curr += size; + if (vio_cmo.curr > vio_cmo.high) + vio_cmo.high = vio_cmo.curr; + viodev->cmo.allocated += size; + size -= min(reserve_free, size); + vio_cmo.excess.free -= size; + ret = 0; + } + + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return ret; +} + +/** + * vio_cmo_dealloc - deallocate IO memory from CMO-enable devices + * @viodev: VIO device freeing IO memory + * @size: size of deallocation + * + * IO memory is freed by the device back to the correct memory pools. + * The spare pool is replenished first from either memory pool, then + * the reserve pool is used to reduce device entitlement, the excess + * pool is used to increase the reserve pool toward the desired entitlement + * target, and then the remaining memory is returned to the pools. 
+ * + */ +static inline void vio_cmo_dealloc(struct vio_dev *viodev, size_t size) +{ + unsigned long flags; + size_t spare_needed = 0; + size_t excess_freed = 0; + size_t reserve_freed = size; + size_t tmp; + int balance = 0; + + spin_lock_irqsave(&vio_cmo.lock, flags); + vio_cmo.curr -= size; + + /* Amount of memory freed from the excess pool */ + if (viodev->cmo.allocated > viodev->cmo.entitled) { + excess_freed = min(reserve_freed, (viodev->cmo.allocated - + viodev->cmo.entitled)); + reserve_freed -= excess_freed; + } + + /* Remove allocation from device */ + viodev->cmo.allocated -= (reserve_freed + excess_freed); + + /* Spare is a subset of the reserve pool, replenish it first. */ + spare_needed = VIO_CMO_MIN_ENT - vio_cmo.spare; + + /* + * Replenish the spare in the reserve pool from the excess pool. + * This moves entitlement into the reserve pool. + */ + if (spare_needed && excess_freed) { + tmp = min(excess_freed, spare_needed); + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + vio_cmo.spare += tmp; + excess_freed -= tmp; + spare_needed -= tmp; + balance = 1; + } + + /* + * Replenish the spare in the reserve pool from the reserve pool. + * This removes entitlement from the device down to VIO_CMO_MIN_ENT, + * if needed, and gives it to the spare pool. The amount of used + * memory in this pool does not change. + */ + if (spare_needed && reserve_freed) { + tmp = min(spare_needed, min(reserve_freed, + (viodev->cmo.entitled - + VIO_CMO_MIN_ENT))); + + vio_cmo.spare += tmp; + viodev->cmo.entitled -= tmp; + reserve_freed -= tmp; + spare_needed -= tmp; + balance = 1; + } + + /* + * Increase the reserve pool until the desired allocation is met. + * Move an allocation freed from the excess pool into the reserve + * pool and schedule a balance operation. 
+ */ + if (excess_freed && (vio_cmo.desired > vio_cmo.reserve.size)) { + tmp = min(excess_freed, (vio_cmo.desired - vio_cmo.reserve.size)); + + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + excess_freed -= tmp; + balance = 1; + } + + /* Return memory from the excess pool to that pool */ + if (excess_freed) + vio_cmo.excess.free += excess_freed; + + if (balance) + schedule_delayed_work(&vio_cmo.balance_q, VIO_CMO_BALANCE_DELAY); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +/** + * vio_cmo_entitlement_update - Manage system entitlement changes + * + * @new_entitlement: new system entitlement to attempt to accommodate + * + * Increases in entitlement will be used to fulfill the spare entitlement + * and the rest is given to the excess pool. Decreases, if they are + * possible, come from the excess pool and from unused device entitlement + * + * Returns: 0 on success, -ENOMEM when change can not be made + */ +int vio_cmo_entitlement_update(size_t new_entitlement) +{ + struct vio_dev *viodev; + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t avail, delta, tmp; + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Entitlement increases */ + if (new_entitlement > vio_cmo.entitled) { + delta = new_entitlement - vio_cmo.entitled; + + /* Fulfill spare allocation */ + if (vio_cmo.spare < VIO_CMO_MIN_ENT) { + tmp = min(delta, (VIO_CMO_MIN_ENT - vio_cmo.spare)); + vio_cmo.spare += tmp; + vio_cmo.reserve.size += tmp; + delta -= tmp; + } + + /* Remaining new allocation goes to the excess pool */ + vio_cmo.entitled += delta; + vio_cmo.excess.size += delta; + vio_cmo.excess.free += delta; + + goto out; + } + + /* Entitlement decreases */ + delta = vio_cmo.entitled - new_entitlement; + avail = vio_cmo.excess.free; + + /* + * Need to check how much unused entitlement each device can + * sacrifice to fulfill entitlement change. 
+ */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + if (avail >= delta) + break; + + viodev = dev_ent->viodev; + if ((viodev->cmo.entitled > viodev->cmo.allocated) && + (viodev->cmo.entitled > VIO_CMO_MIN_ENT)) + avail += viodev->cmo.entitled - + max_t(size_t, viodev->cmo.allocated, + VIO_CMO_MIN_ENT); + } + + if (delta <= avail) { + vio_cmo.entitled -= delta; + + /* Take entitlement from the excess pool first */ + tmp = min(vio_cmo.excess.free, delta); + vio_cmo.excess.size -= tmp; + vio_cmo.excess.free -= tmp; + delta -= tmp; + + /* + * Remove all but VIO_CMO_MIN_ENT bytes from devices + * until entitlement change is served + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + if (!delta) + break; + + viodev = dev_ent->viodev; + tmp = 0; + if ((viodev->cmo.entitled > viodev->cmo.allocated) && + (viodev->cmo.entitled > VIO_CMO_MIN_ENT)) + tmp = viodev->cmo.entitled - + max_t(size_t, viodev->cmo.allocated, + VIO_CMO_MIN_ENT); + viodev->cmo.entitled -= min(tmp, delta); + delta -= min(tmp, delta); + } + } else { + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return -ENOMEM; + } + +out: + schedule_delayed_work(&vio_cmo.balance_q, 0); + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return 0; +} + +/** + * vio_cmo_balance - Balance entitlement among devices + * + * @work: work queue structure for this operation + * + * Any system entitlement above the minimum needed for devices, or + * already allocated to devices, can be distributed to the devices. + * The list of devices is iterated through to recalculate the desired + * entitlement level and to determine how much entitlement above the + * minimum entitlement is allocated to devices. + * + * Small chunks of the available entitlement are given to devices until + * their requirements are fulfilled or there is no entitlement left to give. + * Upon completion sizes of the reserve and excess pools are calculated. + * + * The system minimum entitlement level is also recalculated here. 
+ * Entitlement will be reserved for devices even after vio_bus_remove to + * accommodate reloading the driver. The OF tree is walked to count the + * number of devices present and this will remove entitlement for devices + * that have actually left the system after having vio_bus_remove called. + */ +static void vio_cmo_balance(struct work_struct *work) +{ + struct vio_cmo *cmo; + struct vio_dev *viodev; + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t avail = 0, level, chunk, need; + int devcount = 0, fulfilled; + + cmo = container_of(work, struct vio_cmo, balance_q.work); + + spin_lock_irqsave(&vio_cmo.lock, flags); + + /* Calculate minimum entitlement and fulfill spare */ + cmo->min = vio_cmo_num_OF_devs() * VIO_CMO_MIN_ENT; + BUG_ON(cmo->min > cmo->entitled); + cmo->spare = min_t(size_t, VIO_CMO_MIN_ENT, (cmo->entitled - cmo->min)); + cmo->min += cmo->spare; + cmo->desired = cmo->min; + + /* + * Determine how much entitlement is available and reset device + * entitlements + */ + avail = cmo->entitled - cmo->spare; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + devcount++; + viodev->cmo.entitled = VIO_CMO_MIN_ENT; + cmo->desired += (viodev->cmo.desired - VIO_CMO_MIN_ENT); + avail -= max_t(size_t, viodev->cmo.allocated, VIO_CMO_MIN_ENT); + } + + /* + * Having provided each device with the minimum entitlement, loop + * over the devices portioning out the remaining entitlement + * until there is nothing left. + */ + level = VIO_CMO_MIN_ENT; + while (avail) { + fulfilled = 0; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + + if (viodev->cmo.desired <= level) { + fulfilled++; + continue; + } + + /* + * Give the device up to VIO_CMO_BALANCE_CHUNK + * bytes of entitlement, but do not exceed the + * desired level of entitlement for the device. 
+ */ + chunk = min_t(size_t, avail, VIO_CMO_BALANCE_CHUNK); + chunk = min(chunk, (viodev->cmo.desired - + viodev->cmo.entitled)); + viodev->cmo.entitled += chunk; + + /* + * If the memory for this entitlement increase was + * already allocated to the device it does not come + * from the available pool being portioned out. + */ + need = max(viodev->cmo.allocated, viodev->cmo.entitled)- + max(viodev->cmo.allocated, level); + avail -= need; + + } + if (fulfilled == devcount) + break; + level += VIO_CMO_BALANCE_CHUNK; + } + + /* Calculate new reserve and excess pool sizes */ + cmo->reserve.size = cmo->min; + cmo->excess.free = 0; + cmo->excess.size = 0; + need = 0; + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) { + viodev = dev_ent->viodev; + /* Calculated reserve size above the minimum entitlement */ + if (viodev->cmo.entitled) + cmo->reserve.size += (viodev->cmo.entitled - + VIO_CMO_MIN_ENT); + /* Calculated used excess entitlement */ + if (viodev->cmo.allocated > viodev->cmo.entitled) + need += viodev->cmo.allocated - viodev->cmo.entitled; + } + cmo->excess.size = cmo->entitled - cmo->reserve.size; + cmo->excess.free = cmo->excess.size - need; + + cancel_delayed_work(container_of(work, struct delayed_work, work)); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +static void *vio_dma_iommu_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag) +{ + struct vio_dev *viodev = to_vio_dev(dev); + void *ret; + + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) { + atomic_inc(&viodev->cmo.allocs_failed); + return NULL; + } + + ret = dma_iommu_ops.alloc_coherent(dev, size, dma_handle, flag); + if (unlikely(ret == NULL)) { + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); + atomic_inc(&viodev->cmo.allocs_failed); + } + + return ret; +} + +static void vio_dma_iommu_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + struct vio_dev *viodev = to_vio_dev(dev); + + 
dma_iommu_ops.free_coherent(dev, size, vaddr, dma_handle); + + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); +} + +static dma_addr_t vio_dma_iommu_map_single(struct device *dev, void *vaddr, + size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + dma_addr_t ret = DMA_ERROR_CODE; + + if (vio_cmo_alloc(viodev, roundup(size, IOMMU_PAGE_SIZE))) { + atomic_inc(&viodev->cmo.allocs_failed); + return ret; + } + + ret = dma_iommu_ops.map_single(dev, vaddr, size, direction, attrs); + if (unlikely(dma_mapping_error(ret))) { + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); + atomic_inc(&viodev->cmo.allocs_failed); + } + + return ret; +} + +static void vio_dma_iommu_unmap_single(struct device *dev, + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + + dma_iommu_ops.unmap_single(dev, dma_handle, size, direction, attrs); + + vio_cmo_dealloc(viodev, roundup(size, IOMMU_PAGE_SIZE)); +} + +/* + * Map a scatterlist, reserving CMO entitlement for the page-rounded length + * of every entry first. Returns the number of entries mapped, or 0 when the + * entitlement or the mapping could not be obtained. + */ +static int vio_dma_iommu_map_sg(struct device *dev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct scatterlist *sgl; + int ret, count = 0; + size_t alloc_size = 0; + + for (sgl = sglist; count < nelems; count++, sgl++) + alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE); + + if (vio_cmo_alloc(viodev, alloc_size)) { + atomic_inc(&viodev->cmo.allocs_failed); + return 0; + } + + ret = dma_iommu_ops.map_sg(dev, sglist, nelems, direction, attrs); + + if (unlikely(!ret)) { + /* + * Nothing was mapped: hand back the whole reservation and + * return now, otherwise the trimming below would release + * the same alloc_size a second time. + */ + vio_cmo_dealloc(viodev, alloc_size); + atomic_inc(&viodev->cmo.allocs_failed); + return ret; + } + + /* Release the part of the reservation the mapping did not use */ + for (sgl = sglist, count = 0; count < ret; count++, sgl++) + alloc_size -= roundup(sgl->dma_length, IOMMU_PAGE_SIZE); + if (alloc_size) + vio_cmo_dealloc(viodev, alloc_size); + + return ret; +} + +static void vio_dma_iommu_unmap_sg(struct device 
*dev, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + struct vio_dev *viodev = to_vio_dev(dev); + struct scatterlist *sgl; + size_t alloc_size = 0; + int count = 0; + + for (sgl = sglist; count < nelems; count++, sgl++) + alloc_size += roundup(sgl->dma_length, IOMMU_PAGE_SIZE); + + dma_iommu_ops.unmap_sg(dev, sglist, nelems, direction, attrs); + + vio_cmo_dealloc(viodev, alloc_size); +} + +struct dma_mapping_ops vio_dma_mapping_ops = { + .alloc_coherent = vio_dma_iommu_alloc_coherent, + .free_coherent = vio_dma_iommu_free_coherent, + .map_single = vio_dma_iommu_map_single, + .unmap_single = vio_dma_iommu_unmap_single, + .map_sg = vio_dma_iommu_map_sg, + .unmap_sg = vio_dma_iommu_unmap_sg, +}; + +/** + * vio_cmo_set_dev_desired - Set desired entitlement for a device + * + * @viodev: struct vio_dev for device to alter + * @new_desired: new desired entitlement level in bytes + * + * For use by devices to request a change to their entitlement at runtime or + * through sysfs. The desired entitlement level is changed and a balancing + * of system resources is scheduled to run in the future. + */ +void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) +{ + unsigned long flags; + struct vio_cmo_dev_entry *dev_ent; + int found = 0; + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return; + + spin_lock_irqsave(&vio_cmo.lock, flags); + if (desired < VIO_CMO_MIN_ENT) + desired = VIO_CMO_MIN_ENT; + + /* + * Changes will not be made for devices not in the device list. + * If it is not in the device list, then no driver is loaded + * for the device and it can not receive entitlement. 
+ */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) + if (viodev == dev_ent->viodev) { + found = 1; + break; + } + if (!found) { + /* Do not leave with vio_cmo.lock held and interrupts off */ + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return; + } + + /* Increase/decrease in desired device entitlement */ + if (desired >= viodev->cmo.desired) { + /* Just bump the bus and device values prior to a balance */ + vio_cmo.desired += desired - viodev->cmo.desired; + viodev->cmo.desired = desired; + } else { + /* Decrease bus and device values for desired entitlement */ + vio_cmo.desired -= viodev->cmo.desired - desired; + viodev->cmo.desired = desired; + /* + * If less entitlement is desired than current entitlement, move + * any reserve memory in the change region to the excess pool. + */ + if (viodev->cmo.entitled > desired) { + vio_cmo.reserve.size -= viodev->cmo.entitled - desired; + vio_cmo.excess.size += viodev->cmo.entitled - desired; + /* + * If entitlement moving from the reserve pool to the + * excess pool is currently unused, add to the excess + * free counter. + */ + if (viodev->cmo.allocated < viodev->cmo.entitled) + vio_cmo.excess.free += viodev->cmo.entitled - + max(viodev->cmo.allocated, desired); + viodev->cmo.entitled = desired; + } + } + schedule_delayed_work(&vio_cmo.balance_q, 0); + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +/** + * vio_cmo_bus_probe - Handle CMO specific bus probe activities + * + * @viodev - Pointer to struct vio_dev for device + * + * Determine the devices IO memory entitlement needs, attempting + * to satisfy the system minimum entitlement at first and scheduling + * a balance operation to take care of the rest at a later time. + * + * Returns: 0 on success, -EINVAL when device doesn't support CMO, and + * -ENOMEM when entitlement is not available for device or + * device entry. 
+ * + */ +static int vio_cmo_bus_probe(struct vio_dev *viodev) +{ + struct vio_cmo_dev_entry *dev_ent; + struct device *dev = &viodev->dev; + struct vio_driver *viodrv = to_vio_driver(dev->driver); + unsigned long flags; + size_t size; + + /* + * Check to see that device has a DMA window and configure + * entitlement for the device. + */ + if (of_get_property(viodev->dev.archdata.of_node, + "ibm,my-dma-window", NULL)) { + /* Check that the driver is CMO enabled and get desired DMA */ + if (!viodrv->get_desired_dma) { + dev_err(dev, "%s: device driver does not support CMO\n", + __func__); + return -EINVAL; + } + + viodev->cmo.desired = IOMMU_PAGE_ALIGN(viodrv->get_desired_dma(viodev)); + if (viodev->cmo.desired < VIO_CMO_MIN_ENT) + viodev->cmo.desired = VIO_CMO_MIN_ENT; + size = VIO_CMO_MIN_ENT; + + dev_ent = kmalloc(sizeof(struct vio_cmo_dev_entry), + GFP_KERNEL); + if (!dev_ent) + return -ENOMEM; + + dev_ent->viodev = viodev; + spin_lock_irqsave(&vio_cmo.lock, flags); + list_add(&dev_ent->list, &vio_cmo.device_list); + } else { + viodev->cmo.desired = 0; + size = 0; + spin_lock_irqsave(&vio_cmo.lock, flags); + } + + /* + * If the needs for vio_cmo.min have not changed since they + * were last set, the number of devices in the OF tree has + * been constant and the IO memory for this is already in + * the reserve pool. + */ + if (vio_cmo.min == ((vio_cmo_num_OF_devs() + 1) * + VIO_CMO_MIN_ENT)) { + /* Updated desired entitlement if device requires it */ + if (size) + vio_cmo.desired += (viodev->cmo.desired - + VIO_CMO_MIN_ENT); + } else { + size_t tmp; + + /* tmp is everything that can still be handed out: spare + free excess */ + tmp = vio_cmo.spare + vio_cmo.excess.free; + if (tmp < size) { + dev_err(dev, "%s: insufficient free " + "entitlement to add device. 
" + "Need %lu, have %lu\n", __func__, + size, tmp); + /* + * size is non-zero here, so dev_ent was queued above; + * unlink and free it so a failed probe leaves no stale + * entry on the device list. + */ + list_del(&dev_ent->list); + kfree(dev_ent); + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return -ENOMEM; + } + + /* Use excess pool first to fulfill request */ + tmp = min(size, vio_cmo.excess.free); + vio_cmo.excess.free -= tmp; + vio_cmo.excess.size -= tmp; + vio_cmo.reserve.size += tmp; + + /* Use spare if excess pool was insufficient */ + vio_cmo.spare -= size - tmp; + + /* Update bus accounting */ + vio_cmo.min += size; + vio_cmo.desired += viodev->cmo.desired; + } + spin_unlock_irqrestore(&vio_cmo.lock, flags); + return 0; +} + +/** + * vio_cmo_bus_remove - Handle CMO specific bus removal activities + * + * @viodev - Pointer to struct vio_dev for device + * + * Remove the device from the cmo device list. The minimum entitlement + * will be reserved for the device as long as it is in the system. The + * rest of the entitlement the device had been allocated will be returned + * to the system. + */ +static void vio_cmo_bus_remove(struct vio_dev *viodev) +{ + struct vio_cmo_dev_entry *dev_ent; + unsigned long flags; + size_t tmp; + + spin_lock_irqsave(&vio_cmo.lock, flags); + if (viodev->cmo.allocated) { + dev_err(&viodev->dev, "%s: device had %lu bytes of IO " + "allocated after remove operation.\n", + __func__, viodev->cmo.allocated); + BUG(); + } + + /* + * Remove the device from the device list being maintained for + * CMO enabled devices. + */ + list_for_each_entry(dev_ent, &vio_cmo.device_list, list) + if (viodev == dev_ent->viodev) { + list_del(&dev_ent->list); + kfree(dev_ent); + break; + } + + /* + * Devices may not require any entitlement and they do not need + * to be processed. Otherwise, return the device's entitlement + * back to the pools. 
+ */ + if (viodev->cmo.entitled) { + /* + * This device has not yet left the OF tree, it's + * minimum entitlement remains in vio_cmo.min and + * vio_cmo.desired + */ + vio_cmo.desired -= (viodev->cmo.desired - VIO_CMO_MIN_ENT); + + /* + * Save min allocation for device in reserve as long + * as it exists in OF tree as determined by later + * balance operation + */ + viodev->cmo.entitled -= VIO_CMO_MIN_ENT; + + /* Replenish spare from freed reserve pool */ + if (viodev->cmo.entitled && (vio_cmo.spare < VIO_CMO_MIN_ENT)) { + tmp = min(viodev->cmo.entitled, (VIO_CMO_MIN_ENT - + vio_cmo.spare)); + vio_cmo.spare += tmp; + viodev->cmo.entitled -= tmp; + } + + /* Remaining reserve goes to excess pool */ + vio_cmo.excess.size += viodev->cmo.entitled; + vio_cmo.excess.free += viodev->cmo.entitled; + vio_cmo.reserve.size -= viodev->cmo.entitled; + + /* + * Until the device is removed it will keep a + * minimum entitlement; this will guarantee that + * a module unload/load will result in a success. + */ + viodev->cmo.entitled = VIO_CMO_MIN_ENT; + viodev->cmo.desired = VIO_CMO_MIN_ENT; + atomic_set(&viodev->cmo.allocs_failed, 0); + } + + spin_unlock_irqrestore(&vio_cmo.lock, flags); +} + +static void vio_cmo_set_dma_ops(struct vio_dev *viodev) +{ + vio_dma_mapping_ops.dma_supported = dma_iommu_ops.dma_supported; + viodev->dev.archdata.dma_ops = &vio_dma_mapping_ops; +} + +/** + * vio_cmo_bus_init - CMO entitlement initialization at bus init time + * + * Set up the reserve and excess entitlement pools based on available + * system entitlement and the number of devices in the OF tree that + * require entitlement in the reserve pool. 
+ */ +static void vio_cmo_bus_init(void) +{ + struct hvcall_mpp_data mpp_data; + int err; + + memset(&vio_cmo, 0, sizeof(struct vio_cmo)); + spin_lock_init(&vio_cmo.lock); + INIT_LIST_HEAD(&vio_cmo.device_list); + INIT_DELAYED_WORK(&vio_cmo.balance_q, vio_cmo_balance); + + /* Get current system entitlement */ + err = h_get_mpp(&mpp_data); + + /* + * On failure, continue with entitlement set to 0, will panic() + * later when spare is reserved. + */ + if (err != H_SUCCESS) { + printk(KERN_ERR "%s: unable to determine system IO "\ + "entitlement. (%d)\n", __func__, err); + vio_cmo.entitled = 0; + } else { + vio_cmo.entitled = mpp_data.entitled_mem; + } + + /* Set reservation and check against entitlement */ + vio_cmo.spare = VIO_CMO_MIN_ENT; + vio_cmo.reserve.size = vio_cmo.spare; + vio_cmo.reserve.size += (vio_cmo_num_OF_devs() * + VIO_CMO_MIN_ENT); + if (vio_cmo.reserve.size > vio_cmo.entitled) { + printk(KERN_ERR "%s: insufficient system entitlement\n", + __func__); + panic("%s: Insufficient system entitlement", __func__); + } + + /* Set the remaining accounting variables */ + vio_cmo.excess.size = vio_cmo.entitled - vio_cmo.reserve.size; + vio_cmo.excess.free = vio_cmo.excess.size; + vio_cmo.min = vio_cmo.reserve.size; + vio_cmo.desired = vio_cmo.reserve.size; +} + +/* sysfs device functions and data structures for CMO */ + +#define viodev_cmo_rd_attr(name) \ +static ssize_t viodev_cmo_##name##_show(struct device *dev, \ + struct device_attribute *attr, \ + char *buf) \ +{ \ + return sprintf(buf, "%lu\n", to_vio_dev(dev)->cmo.name); \ +} + +static ssize_t viodev_cmo_allocs_failed_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct vio_dev *viodev = to_vio_dev(dev); + return sprintf(buf, "%d\n", atomic_read(&viodev->cmo.allocs_failed)); +} + +static ssize_t viodev_cmo_allocs_failed_reset(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct vio_dev *viodev = to_vio_dev(dev); + 
atomic_set(&viodev->cmo.allocs_failed, 0); + return count; +} + +static ssize_t viodev_cmo_desired_set(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct vio_dev *viodev = to_vio_dev(dev); + size_t new_desired; + int ret; + + ret = strict_strtoul(buf, 10, &new_desired); + if (ret) + return ret; + + vio_cmo_set_dev_desired(viodev, new_desired); + return count; +} + +viodev_cmo_rd_attr(desired); +viodev_cmo_rd_attr(entitled); +viodev_cmo_rd_attr(allocated); + +static ssize_t name_show(struct device *, struct device_attribute *, char *); +static ssize_t devspec_show(struct device *, struct device_attribute *, char *); +static struct device_attribute vio_cmo_dev_attrs[] = { + __ATTR_RO(name), + __ATTR_RO(devspec), + __ATTR(cmo_desired, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH, + viodev_cmo_desired_show, viodev_cmo_desired_set), + __ATTR(cmo_entitled, S_IRUGO, viodev_cmo_entitled_show, NULL), + __ATTR(cmo_allocated, S_IRUGO, viodev_cmo_allocated_show, NULL), + __ATTR(cmo_allocs_failed, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH, + viodev_cmo_allocs_failed_show, viodev_cmo_allocs_failed_reset), + __ATTR_NULL +}; + +/* sysfs bus functions and data structures for CMO */ + +#define viobus_cmo_rd_attr(name) \ +static ssize_t \ +viobus_cmo_##name##_show(struct bus_type *bt, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", vio_cmo.name); \ +} + +#define viobus_cmo_pool_rd_attr(name, var) \ +static ssize_t \ +viobus_cmo_##name##_pool_show_##var(struct bus_type *bt, char *buf) \ +{ \ + return sprintf(buf, "%lu\n", vio_cmo.name.var); \ +} + +static ssize_t viobus_cmo_high_reset(struct bus_type *bt, const char *buf, + size_t count) +{ + unsigned long flags; + + spin_lock_irqsave(&vio_cmo.lock, flags); + vio_cmo.high = vio_cmo.curr; + spin_unlock_irqrestore(&vio_cmo.lock, flags); + + return count; +} + +viobus_cmo_rd_attr(entitled); +viobus_cmo_pool_rd_attr(reserve, size); +viobus_cmo_pool_rd_attr(excess, size); 
+viobus_cmo_pool_rd_attr(excess, free); +viobus_cmo_rd_attr(spare); +viobus_cmo_rd_attr(min); +viobus_cmo_rd_attr(desired); +viobus_cmo_rd_attr(curr); +viobus_cmo_rd_attr(high); + +static struct bus_attribute vio_cmo_bus_attrs[] = { + __ATTR(cmo_entitled, S_IRUGO, viobus_cmo_entitled_show, NULL), + __ATTR(cmo_reserve_size, S_IRUGO, viobus_cmo_reserve_pool_show_size, NULL), + __ATTR(cmo_excess_size, S_IRUGO, viobus_cmo_excess_pool_show_size, NULL), + __ATTR(cmo_excess_free, S_IRUGO, viobus_cmo_excess_pool_show_free, NULL), + __ATTR(cmo_spare, S_IRUGO, viobus_cmo_spare_show, NULL), + __ATTR(cmo_min, S_IRUGO, viobus_cmo_min_show, NULL), + __ATTR(cmo_desired, S_IRUGO, viobus_cmo_desired_show, NULL), + __ATTR(cmo_curr, S_IRUGO, viobus_cmo_curr_show, NULL), + __ATTR(cmo_high, S_IWUSR|S_IRUSR|S_IWGRP|S_IRGRP|S_IROTH, + viobus_cmo_high_show, viobus_cmo_high_reset), + __ATTR_NULL +}; + +static void vio_cmo_sysfs_init(void) +{ + vio_bus_type.dev_attrs = vio_cmo_dev_attrs; + vio_bus_type.bus_attrs = vio_cmo_bus_attrs; +} +#else /* CONFIG_PPC_SMLPAR */ +/* Dummy functions for iSeries platform */ +int vio_cmo_entitlement_update(size_t new_entitlement) { return 0; } +void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired) {} +static int vio_cmo_bus_probe(struct vio_dev *viodev) { return 0; } +static void vio_cmo_bus_remove(struct vio_dev *viodev) {} +static void vio_cmo_set_dma_ops(struct vio_dev *viodev) {} +static void vio_cmo_bus_init() {} +static void vio_cmo_sysfs_init() { } +#endif /* CONFIG_PPC_SMLPAR */ +EXPORT_SYMBOL(vio_cmo_entitlement_update); +EXPORT_SYMBOL(vio_cmo_set_dev_desired); + static struct iommu_table *vio_build_iommu_table(struct vio_dev *dev) { const unsigned char *dma_window; @@ -114,8 +1105,17 @@ static int vio_bus_probe(struct device *dev) return error; id = vio_match_device(viodrv->id_table, viodev); - if (id) + if (id) { + memset(&viodev->cmo, 0, sizeof(viodev->cmo)); + if (firmware_has_feature(FW_FEATURE_CMO)) { + error = 
vio_cmo_bus_probe(viodev); + if (error) + return error; + } error = viodrv->probe(viodev, id); + if (error) + vio_cmo_bus_remove(viodev); + } return error; } @@ -125,12 +1125,23 @@ static int vio_bus_remove(struct device *dev) { struct vio_dev *viodev = to_vio_dev(dev); struct vio_driver *viodrv = to_vio_driver(dev->driver); + struct device *devptr; + int ret = 1; + + /* + * Hold a reference to the device after the remove function is called + * to allow for CMO accounting cleanup for the device. + */ + devptr = get_device(dev); if (viodrv->remove) - return viodrv->remove(viodev); + ret = viodrv->remove(viodev); + + if (!ret && firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_remove(viodev); - /* driver can't remove */ - return 1; + put_device(devptr); + return ret; } /** @@ -215,7 +1226,11 @@ struct vio_dev *vio_register_device_node(struct device_node *of_node) viodev->unit_address = *unit_address; } viodev->dev.archdata.of_node = of_node_get(of_node); - viodev->dev.archdata.dma_ops = &dma_iommu_ops; + + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_set_dma_ops(viodev); + else + viodev->dev.archdata.dma_ops = &dma_iommu_ops; viodev->dev.archdata.dma_data = vio_build_iommu_table(viodev); viodev->dev.archdata.numa_node = of_node_to_nid(of_node); @@ -245,6 +1260,9 @@ static int __init vio_bus_init(void) int err; struct device_node *node_vroot; + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_sysfs_init(); + err = bus_register(&vio_bus_type); if (err) { printk(KERN_ERR "failed to register VIO bus\n"); @@ -262,6 +1280,9 @@ static int __init vio_bus_init(void) return err; } + if (firmware_has_feature(FW_FEATURE_CMO)) + vio_cmo_bus_init(); + node_vroot = of_find_node_by_name(NULL, "vdevice"); if (node_vroot) { struct device_node *of_node; diff --git a/include/asm-powerpc/vio.h b/include/asm-powerpc/vio.h index 56512a968dab..0a290a195946 100644 --- a/include/asm-powerpc/vio.h +++ b/include/asm-powerpc/vio.h @@ -39,16 +39,32 @@ #define VIO_IRQ_DISABLE 0UL 
#define VIO_IRQ_ENABLE 1UL +/* + * VIO CMO minimum entitlement for all devices and spare entitlement + */ +#define VIO_CMO_MIN_ENT 1562624 + struct iommu_table; -/* - * The vio_dev structure is used to describe virtual I/O devices. +/** + * vio_dev - This structure is used to describe virtual I/O devices. + * + * @desired: set from return of driver's get_desired_dma() function + * @entitled: bytes of IO data that has been reserved for this device. + * @allocated: bytes of IO data currently in use by the device. + * @allocs_failed: number of DMA failures due to insufficient entitlement. */ struct vio_dev { const char *name; const char *type; uint32_t unit_address; unsigned int irq; + struct { + size_t desired; + size_t entitled; + size_t allocated; + atomic_t allocs_failed; + } cmo; struct device dev; }; @@ -56,12 +72,19 @@ struct vio_driver { const struct vio_device_id *id_table; int (*probe)(struct vio_dev *dev, const struct vio_device_id *id); int (*remove)(struct vio_dev *dev); + /* A driver must have a get_desired_dma() function to + * be loaded in a CMO environment if it uses DMA. + */ + unsigned long (*get_desired_dma)(struct vio_dev *dev); struct device_driver driver; }; extern int vio_register_driver(struct vio_driver *drv); extern void vio_unregister_driver(struct vio_driver *drv); +extern int vio_cmo_entitlement_update(size_t); +extern void vio_cmo_set_dev_desired(struct vio_dev *viodev, size_t desired); + extern void __devinit vio_unregister_device(struct vio_dev *dev); struct device_node; -- cgit v1.2.3 From 1e3519f8e1baec0b733cd42684fcd3d9681662f1 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 25 Jul 2008 16:21:11 +1000 Subject: Move update_mmu_cache() declaration from tlbflush.h to pgtable.h where it belongs. 
This fixes some build problems on some configs Signed-off-by: Benjamin Herrenschmidt --- include/asm-powerpc/pgtable.h | 13 +++++++++++++ include/asm-powerpc/tlbflush.h | 11 ----------- 2 files changed, 13 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/asm-powerpc/pgtable.h b/include/asm-powerpc/pgtable.h index d18ffe7bc7c4..dbb8ca172e44 100644 --- a/include/asm-powerpc/pgtable.h +++ b/include/asm-powerpc/pgtable.h @@ -38,6 +38,19 @@ extern void paging_init(void); remap_pfn_range(vma, vaddr, pfn, size, prot) #include + + +/* + * This gets called at the end of handling a page fault, when + * the kernel has put a new PTE into the page table for the process. + * We use it to ensure coherency between the i-cache and d-cache + * for the page which has just been mapped in. + * On machines which use an MMU hash table, we use this to put a + * corresponding HPTE into the hash table ahead of time, instead of + * waiting for the inevitable extra hash-table miss exception. + */ +extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t); + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/include/asm-powerpc/tlbflush.h b/include/asm-powerpc/tlbflush.h index 5c9108147644..361cd5c7a32b 100644 --- a/include/asm-powerpc/tlbflush.h +++ b/include/asm-powerpc/tlbflush.h @@ -162,16 +162,5 @@ extern void __flush_hash_table_range(struct mm_struct *mm, unsigned long start, #endif -/* - * This gets called at the end of handling a page fault, when - * the kernel has put a new PTE into the page table for the process. - * We use it to ensure coherency between the i-cache and d-cache - * for the page which has just been mapped in. - * On machines which use an MMU hash table, we use this to put a - * corresponding HPTE into the hash table ahead of time, instead of - * waiting for the inevitable extra hash-table miss exception. 
- */ -extern void update_mmu_cache(struct vm_area_struct *, unsigned long, pte_t); - #endif /*__KERNEL__ */ #endif /* _ASM_POWERPC_TLBFLUSH_H */ -- cgit v1.2.3 From 973b7d83ebeb1e34b8bee69208916e5f0e2353c3 Mon Sep 17 00:00:00 2001 From: Tony Breeds Date: Fri, 25 Jul 2008 16:21:51 +1000 Subject: powerpc: Wireup new syscalls signalfd4, eventfd2, epoll_create1, dup3, pipe2 and inotify_init1 Signed-off-by: Tony Breeds Signed-off-by: Benjamin Herrenschmidt --- include/asm-powerpc/syscalls.h | 1 + include/asm-powerpc/systbl.h | 6 ++++++ include/asm-powerpc/unistd.h | 8 +++++++- 3 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-powerpc/syscalls.h b/include/asm-powerpc/syscalls.h index 2b8a458f990a..eb8eb400c664 100644 --- a/include/asm-powerpc/syscalls.h +++ b/include/asm-powerpc/syscalls.h @@ -31,6 +31,7 @@ asmlinkage int sys_vfork(unsigned long p1, unsigned long p2, unsigned long p3, unsigned long p4, unsigned long p5, unsigned long p6, struct pt_regs *regs); asmlinkage long sys_pipe(int __user *fildes); +asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, struct sigaction __user *oact, size_t sigsetsize); diff --git a/include/asm-powerpc/systbl.h b/include/asm-powerpc/systbl.h index ae7085c65692..e084272ed1c2 100644 --- a/include/asm-powerpc/systbl.h +++ b/include/asm-powerpc/systbl.h @@ -316,3 +316,9 @@ COMPAT_SYS(fallocate) SYSCALL(subpage_prot) COMPAT_SYS_SPU(timerfd_settime) COMPAT_SYS_SPU(timerfd_gettime) +COMPAT_SYS_SPU(signalfd4) +SYSCALL_SPU(eventfd2) +SYSCALL_SPU(epoll_create1) +SYSCALL_SPU(dup3) +SYSCALL_SPU(pipe2) +SYSCALL(inotify_init1) diff --git a/include/asm-powerpc/unistd.h b/include/asm-powerpc/unistd.h index ce91bb662063..e07d0c76ed77 100644 --- a/include/asm-powerpc/unistd.h +++ b/include/asm-powerpc/unistd.h @@ -335,10 +335,16 @@ #define __NR_subpage_prot 310 #define __NR_timerfd_settime 311 #define __NR_timerfd_gettime 
312 +#define __NR_signalfd4 313 +#define __NR_eventfd2 314 +#define __NR_epoll_create1 315 +#define __NR_dup3 316 +#define __NR_pipe2 317 +#define __NR_inotify_init1 318 #ifdef __KERNEL__ -#define __NR_syscalls 313 +#define __NR_syscalls 319 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls -- cgit v1.2.3